diff --git a/backends/tfhe-cuda-backend/cuda/include/integer.h b/backends/tfhe-cuda-backend/cuda/include/integer.h index c1874aed98..a5337a91d1 100644 --- a/backends/tfhe-cuda-backend/cuda/include/integer.h +++ b/backends/tfhe-cuda-backend/cuda/include/integer.h @@ -15,7 +15,6 @@ enum SHIFT_OR_ROTATE_TYPE { LEFT_ROTATE = 2, RIGHT_ROTATE = 3 }; -enum LUT_TYPE { OPERATOR = 0, MAXVALUE = 1, ISNONZERO = 2, BLOCKSLEN = 3 }; enum BITOP_TYPE { BITAND = 0, BITOR = 1, @@ -36,6 +35,11 @@ enum COMPARISON_TYPE { MIN = 7, }; +enum COMPRESSION_MODE { + COMPRESS = 0, + DECOMPRESS = 1, +}; + enum CMP_ORDERING { IS_INFERIOR = 0, IS_EQUAL = 1, IS_SUPERIOR = 2 }; enum SIGNED_OPERATION { ADDITION = 1, SUBTRACTION = -1 }; @@ -202,6 +206,30 @@ void cuda_scalar_comparison_integer_radix_ciphertext_kb_64( void cleanup_cuda_integer_comparison(void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr_void); +void scratch_cuda_compression_integer_radix_ciphertext_64( + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, + uint32_t encryption_glwe_dimension, uint32_t encryption_polynomial_size, + uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size, + uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, + uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, + uint32_t num_lwes, uint32_t message_modulus, uint32_t carry_modulus, + PBS_TYPE pbs_type, uint32_t lwe_per_glwe, uint32_t storage_log_modulus, + COMPRESSION_MODE mode, bool allocate_gpu_memory); + +void cuda_compression_compress_integer_radix_ciphertext_64( + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, + void *glwe_array_out, void *lwe_array_in, void **fp_ksk, uint32_t num_nths, + int8_t *mem_ptr); + +void cuda_compression_decompress_integer_radix_ciphertext_64( + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, + void *lwe_array_out, void *glwe_in, void *indexes_array, + uint32_t indexes_array_size, void **bsks, int8_t *mem_ptr); + +void cleanup_cuda_compression_integer_radix_ciphertext_64( + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, + int8_t **mem_ptr_void); + void scratch_cuda_integer_radix_bitop_kb_64( void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size, @@ -452,7 +480,8 @@ struct int_radix_params { message_modulus(message_modulus), carry_modulus(carry_modulus){}; void print() { - printf("pbs_type: %u, glwe_dimension: %u, polynomial_size: %u, " + printf("pbs_type: %u, glwe_dimension: %u, " + "polynomial_size: %u, " "big_lwe_dimension: %u, " "small_lwe_dimension: %u, ks_level: %u, ks_base_log: %u, pbs_level: " "%u, pbs_base_log: " @@ -790,6 +819,91 @@ template struct int_radix_lut { } }; +template struct int_compression { + COMPRESSION_MODE mode; + int_radix_params encryption_params; + int_radix_params compression_params; + uint32_t storage_log_modulus; + uint32_t lwe_per_glwe; + + uint32_t body_count; + + // Compression + Torus *tmp_lwe; + Torus *tmp_glwe_array_out; + + // Decompression + Torus *tmp_extracted_glwe; + Torus *tmp_extracted_lwe; + int_radix_lut *carry_extract_lut; + + int_compression(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count, int_radix_params encryption_params, + int_radix_params compression_params, + uint32_t num_radix_blocks, uint32_t lwe_per_glwe, + uint32_t storage_log_modulus, COMPRESSION_MODE mode, + bool allocate_gpu_memory) { + this->mode = mode; + this->encryption_params = encryption_params; + 
this->compression_params = compression_params; + this->lwe_per_glwe = lwe_per_glwe; + this->storage_log_modulus = storage_log_modulus; + this->body_count = num_radix_blocks; + + if (allocate_gpu_memory) { + Torus glwe_accumulator_size = (compression_params.glwe_dimension + 1) * + compression_params.polynomial_size; + + tmp_lwe = (Torus *)cuda_malloc_async( + num_radix_blocks * (compression_params.small_lwe_dimension + 1) * + sizeof(Torus), + streams[0], gpu_indexes[0]); + tmp_glwe_array_out = (Torus *)cuda_malloc_async( + glwe_accumulator_size * sizeof(Torus), streams[0], gpu_indexes[0]); + + if (mode == COMPRESSION_MODE::DECOMPRESS) { + carry_extract_lut = new int_radix_lut( + streams, gpu_indexes, gpu_count, encryption_params, 1, + num_radix_blocks, allocate_gpu_memory); + + tmp_extracted_glwe = (Torus *)cuda_malloc_async( + glwe_accumulator_size * sizeof(Torus), streams[0], gpu_indexes[0]); + tmp_extracted_lwe = (Torus *)cuda_malloc_async( + num_radix_blocks * + (compression_params.glwe_dimension * + compression_params.polynomial_size + + 1) * + sizeof(Torus), + streams[0], gpu_indexes[0]); + // Decompression + // Carry extract LUT + auto carry_extract_f = [encryption_params](Torus x) -> Torus { + return x / encryption_params.message_modulus; + }; + + generate_device_accumulator( + streams[0], gpu_indexes[0], + carry_extract_lut->get_lut(gpu_indexes[0], 0), + encryption_params.glwe_dimension, encryption_params.polynomial_size, + encryption_params.message_modulus, encryption_params.carry_modulus, + carry_extract_f); + + carry_extract_lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]); + } + } + } + void release(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count) { + cuda_drop_async(tmp_lwe, streams[0], gpu_indexes[0]); + cuda_drop_async(tmp_glwe_array_out, streams[0], gpu_indexes[0]); + if (mode == COMPRESSION_MODE::DECOMPRESS) { + carry_extract_lut->release(streams, gpu_indexes, gpu_count); + delete (carry_extract_lut); + cuda_drop_async(tmp_extracted_glwe, streams[0], gpu_indexes[0]); + cuda_drop_async(tmp_extracted_lwe, streams[0], gpu_indexes[0]); + } + } +}; template struct int_bit_extract_luts_buffer { int_radix_params params; int_radix_lut *lut; diff --git a/backends/tfhe-cuda-backend/cuda/include/keyswitch.h b/backends/tfhe-cuda-backend/cuda/include/keyswitch.h index 924ec131f9..202c3d6329 100644 --- a/backends/tfhe-cuda-backend/cuda/include/keyswitch.h +++ b/backends/tfhe-cuda-backend/cuda/include/keyswitch.h @@ -16,6 +16,12 @@ void cuda_keyswitch_lwe_ciphertext_vector_64( void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes, void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count, uint32_t num_samples); + +void cuda_fp_keyswitch_lwe_list_to_glwe_64( + void *stream, uint32_t gpu_index, void *glwe_array_out, void *lwe_array_in, + void *fp_ksk_array, uint32_t input_lwe_dimension, + uint32_t output_glwe_dimension, uint32_t output_polynomial_size, + uint32_t base_log, uint32_t level_count, uint32_t num_lwes); } #endif // CNCRT_KS_H_ diff --git a/backends/tfhe-cuda-backend/cuda/src/CMakeLists.txt b/backends/tfhe-cuda-backend/cuda/src/CMakeLists.txt index 3190501fc5..5fec699fbc 100644 --- a/backends/tfhe-cuda-backend/cuda/src/CMakeLists.txt +++ b/backends/tfhe-cuda-backend/cuda/src/CMakeLists.txt @@ -1,17 +1,3 @@ -set(SOURCES - ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bit_extraction.h - ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bitwise_ops.h - ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bootstrap.h - 
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bootstrap_multibit.h - ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/ciphertext.h - ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/circuit_bootstrap.h - ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/device.h - ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/integer.h - ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/keyswitch.h - ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/linear_algebra.h - ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/shifts.h - ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/vertical_packing.h - ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/helper_multi_gpu.h) file(GLOB_RECURSE SOURCES "*.cu") add_library(tfhe_cuda_backend STATIC ${SOURCES}) set_target_properties(tfhe_cuda_backend PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_RESOLVE_DEVICE_SYMBOLS ON) diff --git a/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cuh b/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cuh index b45353a616..9971b5c000 100644 --- a/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cuh @@ -38,8 +38,8 @@ __global__ void sample_extract(Torus *lwe_array_out, Torus *glwe_array_in, auto lwe_out = lwe_array_out + input_id * lwe_output_size; // We assume each GLWE will store the first polynomial_size inputs - uint32_t nth_per_glwe = params::degree; - auto glwe_in = glwe_array_in + (input_id / nth_per_glwe) * glwe_input_size; + uint32_t lwe_per_glwe = params::degree; + auto glwe_in = glwe_array_in + (input_id / lwe_per_glwe) * glwe_input_size; auto nth = nth_array[input_id]; @@ -50,11 +50,11 @@ __global__ void sample_extract(Torus *lwe_array_out, Torus *glwe_array_in, template __host__ void host_sample_extract(cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out, Torus *glwe_array_in, - uint32_t *nth_array, uint32_t num_glwes, + uint32_t *nth_array, uint32_t num_nths, uint32_t glwe_dimension) { cudaSetDevice(gpu_index); - dim3 grid(num_glwes); + dim3 grid(num_nths); dim3 thds(params::degree / params::opt); sample_extract<<>>( lwe_array_out, glwe_array_in, nth_array, glwe_dimension); diff --git a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu index 524a1fa45e..921aa01d7b 100644 --- a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu +++ b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu @@ -10,7 +10,7 @@ void cuda_keyswitch_lwe_ciphertext_vector_32( void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes, void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count, uint32_t num_samples) { - cuda_keyswitch_lwe_ciphertext_vector( + host_keyswitch_lwe_ciphertext_vector( static_cast(stream), gpu_index, static_cast(lwe_array_out), static_cast(lwe_output_indexes), @@ -40,7 +40,7 @@ void cuda_keyswitch_lwe_ciphertext_vector_64( void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes, void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count, uint32_t num_samples) { - cuda_keyswitch_lwe_ciphertext_vector( + host_keyswitch_lwe_ciphertext_vector( static_cast(stream), gpu_index, static_cast(lwe_array_out), static_cast(lwe_output_indexes), @@ -48,3 +48,21 @@ void cuda_keyswitch_lwe_ciphertext_vector_64( static_cast(lwe_input_indexes), static_cast(ksk), lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples); } + +/* Perform functional packing keyswitch on a batch of 64 bits input LWE + * ciphertexts. 
+ */ +void cuda_fp_keyswitch_lwe_list_to_glwe_64( + void *stream, uint32_t gpu_index, void *glwe_array_out, void *lwe_array_in, + void *fp_ksk_array, uint32_t input_lwe_dimension, + uint32_t output_glwe_dimension, uint32_t output_polynomial_size, + uint32_t base_log, uint32_t level_count, uint32_t num_lwes) { + + host_fp_keyswitch_lwe_list_to_glwe( + static_cast(stream), gpu_index, + static_cast(glwe_array_out), + static_cast(lwe_array_in), + static_cast(fp_ksk_array), input_lwe_dimension, + output_glwe_dimension, output_polynomial_size, base_log, level_count, + num_lwes); +} diff --git a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh index db78104a98..423375bccf 100644 --- a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh @@ -7,6 +7,7 @@ #include "polynomial/functions.cuh" #include "polynomial/polynomial_math.cuh" #include "torus.cuh" +#include "utils/helper.cuh" #include "utils/kernel_dimensions.cuh" #include #include @@ -98,7 +99,7 @@ keyswitch(Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes, } template -__host__ void cuda_keyswitch_lwe_ciphertext_vector( +__host__ void host_keyswitch_lwe_ciphertext_vector( cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out, Torus *lwe_output_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, @@ -146,7 +147,7 @@ void execute_keyswitch_async(cudaStream_t *streams, uint32_t *gpu_indexes, GET_VARIANT_ELEMENT(lwe_input_indexes, i); // Compute Keyswitch - cuda_keyswitch_lwe_ciphertext_vector( + host_keyswitch_lwe_ciphertext_vector( streams[i], gpu_indexes[i], current_lwe_array_out, current_lwe_output_indexes, current_lwe_array_in, current_lwe_input_indexes, ksks[i], lwe_dimension_in, lwe_dimension_out, @@ -154,4 +155,146 @@ void execute_keyswitch_async(cudaStream_t *streams, uint32_t *gpu_indexes, } } +// chunk_count = glwe_size * polynomial_size / threads. 
+// each threads will responsible to process only lwe_size times multiplication +template +__device__ void keyswitch_lwe_ciphertext_into_glwe_ciphertext( + Torus *glwe_out, Torus *lwe_in, Torus *fp_ksk, uint32_t lwe_dimension_in, + uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, + uint32_t level_count) { + + extern __shared__ int8_t sharedmem[]; + + // result accumulator, shared memory is used because of frequent access + Torus *local_glwe_chunk = (Torus *)sharedmem; + + const int tid = threadIdx.x + blockIdx.x * blockDim.x; + + const int shmem_index = threadIdx.x; + // the output_glwe is split in chunks and each x-block takes one of them + size_t chunk_id = blockIdx.x; + size_t coef_per_block = blockDim.x; + + // dimensions + size_t glwe_size = (glwe_dimension + 1); + // number of coefficients inside fp-ksk block for each lwe_input coefficient + size_t ksk_block_size = glwe_size * polynomial_size * level_count; + + // initialize accumulator to 0 + local_glwe_chunk[shmem_index] = + SEL(0, lwe_in[lwe_dimension_in], tid == glwe_dimension * polynomial_size); + + // Iterate through all lwe elements + for (int i = 0; i < lwe_dimension_in; i++) { + // Round and prepare decomposition + Torus a_i = round_to_closest_multiple(lwe_in[i], base_log, level_count); + + Torus state = a_i >> (sizeof(Torus) * 8 - base_log * level_count); + Torus mod_b_mask = (1ll << base_log) - 1ll; + + // block of key for current lwe coefficient (cur_input_lwe[i]) + auto ksk_block = &fp_ksk[i * ksk_block_size]; + for (int j = 0; j < level_count; j++) { + auto ksk_glwe = &ksk_block[j * glwe_size * polynomial_size]; + // Iterate through each level and multiply by the ksk piece + auto ksk_glwe_chunk = &ksk_glwe[chunk_id * coef_per_block]; + Torus decomposed = decompose_one(state, mod_b_mask, base_log); + local_glwe_chunk[shmem_index] -= decomposed * ksk_glwe_chunk[shmem_index]; + } + } + + // Persist + glwe_out[tid] = local_glwe_chunk[shmem_index]; +} + +// public functional packing keyswitch +// +// blockIdx.y - input +// chunk_count = glwe_size * polynomial_size / threads. 
+template +__global__ void +fp_keyswitch_lwe_list_to_glwe(Torus *glwe_array_out, Torus *lwe_array_in, + Torus *fp_ksk, uint32_t lwe_dimension_in, + uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t base_log, uint32_t level_count, + Torus *d_mem) { + const int glwe_accumulator_size = (glwe_dimension + 1) * polynomial_size; + const int lwe_size = (lwe_dimension_in + 1); + + const int input_id = blockIdx.y; + const int degree = input_id; + + // Select an input + auto lwe_in = lwe_array_in + input_id * lwe_size; + auto ks_glwe_out = d_mem + input_id * glwe_accumulator_size; + auto glwe_out = glwe_array_out + input_id * glwe_accumulator_size; + // KS LWE to GLWE + keyswitch_lwe_ciphertext_into_glwe_ciphertext( + ks_glwe_out, lwe_in, fp_ksk, lwe_dimension_in, glwe_dimension, + polynomial_size, base_log, level_count); + + // P * x ^degree + polynomial_accumulate_monic_monomial_mul( + glwe_out, ks_glwe_out, degree, polynomial_size, true); +} + +/// To-do: Rewrite this kernel for efficiency +template +__global__ void accumulate_glwes(Torus *glwe_out, Torus *glwe_array_in, + uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t num_lwes) { + const int tid = threadIdx.x + blockIdx.x * blockDim.x; + + extern __shared__ int8_t sharedmem[]; + const int shmem_index = threadIdx.x; + + Torus *glwe_acc = (Torus *)sharedmem; + glwe_acc[shmem_index] = glwe_array_in[tid]; + + // Accumulate + for (int i = 1; i < num_lwes; i++) { + auto glwe_in = glwe_array_in + i * (glwe_dimension + 1) * polynomial_size; + glwe_acc[shmem_index] += glwe_in[tid]; + } + + glwe_out[tid] = glwe_acc[shmem_index]; +} + +template +__host__ void host_fp_keyswitch_lwe_list_to_glwe( + cudaStream_t stream, uint32_t gpu_index, Torus *glwe_out, + Torus *lwe_array_in, Torus *fp_ksk_array, uint32_t lwe_dimension_in, + uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, + uint32_t level_count, uint32_t num_lwes) { + cudaSetDevice(gpu_index); + int glwe_accumulator_size = (glwe_dimension + 1) * polynomial_size; + + int num_blocks = 0, num_threads = 0; + getNumBlocksAndThreads(glwe_accumulator_size, 512, num_blocks, num_threads); + + auto shared_mem = sizeof(Torus) * num_threads; + dim3 grid(num_blocks, num_lwes); + dim3 threads(num_threads); + + auto d_mem = (Torus *)cuda_malloc_async( + num_lwes * glwe_accumulator_size * sizeof(Torus), stream, gpu_index); + auto d_tmp_glwe_array_out = (Torus *)cuda_malloc_async( + num_lwes * glwe_accumulator_size * sizeof(Torus), stream, gpu_index); + + // individually keyswitch each lwe + fp_keyswitch_lwe_list_to_glwe<<>>( + d_tmp_glwe_array_out, lwe_array_in, fp_ksk_array, lwe_dimension_in, + glwe_dimension, polynomial_size, base_log, level_count, d_mem); + check_cuda_error(cudaGetLastError()); + + // accumulate to a single glwe + accumulate_glwes<<>>( + glwe_out, d_tmp_glwe_array_out, glwe_dimension, polynomial_size, + num_lwes); + check_cuda_error(cudaGetLastError()); + + cuda_drop_async(d_mem, stream, gpu_index); + cuda_drop_async(d_tmp_glwe_array_out, stream, gpu_index); +} + #endif diff --git a/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh b/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh index 177892e5a9..69b85c30af 100644 --- a/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh @@ -2,6 +2,7 @@ #define CNCRT_TORUS_CUH #include "types/int128.cuh" +#include "utils/kernel_dimensions.cuh" #include template @@ -29,29 +30,51 @@ __device__ inline void typecast_double_to_torus(double x, template 
__device__ inline T round_to_closest_multiple(T x, uint32_t base_log, uint32_t level_count) { - T shift = sizeof(T) * 8 - level_count * base_log; - T mask = 1ll << (shift - 1); - T b = (x & mask) >> (shift - 1); + const T non_rep_bit_count = sizeof(T) * 8 - level_count * base_log; + const T shift = non_rep_bit_count - 1; T res = x >> shift; - res += b; - res <<= shift; - return res; + res += 1; + res &= (T)(-2); + return res << shift; } template -__device__ __forceinline__ void modulus_switch(T input, T &output, - uint32_t log_modulus) { +__device__ __forceinline__ void apply_modulus_switch(T input, T &output, + uint32_t log_modulus) { constexpr uint32_t BITS = sizeof(T) * 8; - output = input + (((T)1) << (BITS - log_modulus - 1)); output >>= (BITS - log_modulus); } template -__device__ __forceinline__ T modulus_switch(T input, uint32_t log_modulus) { +__device__ __forceinline__ T apply_modulus_switch(T input, + uint32_t log_modulus) { T output; - modulus_switch(input, output, log_modulus); + apply_modulus_switch(input, output, log_modulus); return output; } +template +__global__ void apply_modulus_switch_inplace(Torus *array, int size, + uint32_t log_modulus) { + const int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid < size) { + array[tid] = apply_modulus_switch(array[tid], log_modulus); + } +} + +template +__host__ void modulus_switch_inplace(cudaStream_t stream, uint32_t gpu_index, + Torus *array, int size, + uint32_t log_modulus) { + cudaSetDevice(gpu_index); + + int num_threads = 0, num_blocks = 0; + getNumBlocksAndThreads(size, 1024, num_blocks, num_threads); + + apply_modulus_switch_inplace<<>>( + array, size, log_modulus); + check_cuda_error(cudaGetLastError()); +} + #endif // CNCRT_TORUS_H diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cu b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cu new file mode 100644 index 0000000000..22cd3c6161 --- /dev/null +++ b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cu @@ -0,0 +1,61 @@ +#include "compression.cuh" + +void scratch_cuda_compression_integer_radix_ciphertext_64( + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, + uint32_t encryption_glwe_dimension, uint32_t encryption_polynomial_size, + uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size, + uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, + uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, + uint32_t num_lwes, uint32_t message_modulus, uint32_t carry_modulus, + PBS_TYPE pbs_type, uint32_t lwe_per_glwe, uint32_t storage_log_modulus, + COMPRESSION_MODE mode, bool allocate_gpu_memory) { + + int_radix_params encryption_params( + pbs_type, encryption_glwe_dimension, encryption_polynomial_size, + (encryption_glwe_dimension + 1) * encryption_polynomial_size, + lwe_dimension, ks_level, ks_base_log, pbs_level, pbs_base_log, + grouping_factor, message_modulus, carry_modulus); + + int_radix_params compression_params( + pbs_type, compression_glwe_dimension, compression_polynomial_size, + (compression_glwe_dimension + 1) * compression_polynomial_size, + lwe_dimension, ks_level, ks_base_log, pbs_level, pbs_base_log, + grouping_factor, message_modulus, carry_modulus); + + scratch_cuda_compression_integer_radix_ciphertext_64( + (cudaStream_t *)(streams), gpu_indexes, gpu_count, + (int_compression **)mem_ptr, num_lwes, encryption_params, + compression_params, lwe_per_glwe, storage_log_modulus, mode, + allocate_gpu_memory); +} 
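Editor's note: a minimal, hypothetical host-side driver for the compression entry points declared in integer.h is sketched below. Every numeric parameter, the helper name example_compress, and the device pointers are illustrative placeholders, not values fixed by this patch; only the function names and argument order come from the declarations above.

// Illustrative sketch only: drive scratch -> compress -> cleanup.
// All handles (streams, gpu_indexes, device buffers, fp_ksk) and the numeric
// parameters are caller-supplied placeholders.
void example_compress(void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
                      void *d_glwe_array_out, void *d_lwe_array_in,
                      void **d_fp_ksk, uint32_t num_lwes, PBS_TYPE pbs_type) {
  int8_t *mem_ptr = nullptr;

  // Allocate scratch buffers for COMPRESS mode (example parameters).
  scratch_cuda_compression_integer_radix_ciphertext_64(
      streams, gpu_indexes, gpu_count, &mem_ptr,
      /*encryption_glwe_dimension=*/1, /*encryption_polynomial_size=*/2048,
      /*compression_glwe_dimension=*/1, /*compression_polynomial_size=*/1024,
      /*lwe_dimension=*/742, /*ks_level=*/5, /*ks_base_log=*/3,
      /*pbs_level=*/1, /*pbs_base_log=*/23, /*grouping_factor=*/0,
      num_lwes, /*message_modulus=*/4, /*carry_modulus=*/4, pbs_type,
      /*lwe_per_glwe=*/1024, /*storage_log_modulus=*/12,
      COMPRESSION_MODE::COMPRESS, /*allocate_gpu_memory=*/true);

  // Shift, packing-keyswitch, modulus-switch and bit-pack the LWE list into
  // the output GLWE array.
  cuda_compression_compress_integer_radix_ciphertext_64(
      streams, gpu_indexes, gpu_count, d_glwe_array_out, d_lwe_array_in,
      d_fp_ksk, num_lwes, mem_ptr);

  // Release the scratch buffers.
  cleanup_cuda_compression_integer_radix_ciphertext_64(streams, gpu_indexes,
                                                       gpu_count, &mem_ptr);
}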
+void cuda_compression_compress_integer_radix_ciphertext_64( + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, + void *glwe_array_out, void *lwe_array_in, void **fp_ksk, uint32_t num_nths, + int8_t *mem_ptr) { + + host_integer_compression_compress( + (cudaStream_t *)(streams), gpu_indexes, gpu_count, + static_cast(glwe_array_out), + static_cast(lwe_array_in), (uint64_t **)(fp_ksk), num_nths, + (int_compression *)mem_ptr); +} +void cuda_compression_decompress_integer_radix_ciphertext_64( + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, + void *lwe_array_out, void *glwe_in, void *indexes_array, + uint32_t indexes_array_size, void **bsks, int8_t *mem_ptr) { + + host_integer_compression_decompress( + (cudaStream_t *)(streams), gpu_indexes, gpu_count, + static_cast(lwe_array_out), static_cast(glwe_in), + static_cast(indexes_array), indexes_array_size, bsks, + (int_compression *)mem_ptr); +} + +void cleanup_cuda_compression_integer_radix_ciphertext_64( + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, + int8_t **mem_ptr_void) { + + int_compression *mem_ptr = + (int_compression *)(*mem_ptr_void); + mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count); +} diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh new file mode 100644 index 0000000000..0bf8b7c54a --- /dev/null +++ b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh @@ -0,0 +1,225 @@ +#ifndef CUDA_INTEGER_COMPRESSION_CUH +#define CUDA_INTEGER_COMPRESSION_CUH + +#include "ciphertext.h" +#include "crypto/keyswitch.cuh" +#include "device.h" +#include "integer.h" +#include "integer/integer.cuh" +#include "linearalgebra/multiplication.cuh" +#include "polynomial/functions.cuh" +#include "utils/kernel_dimensions.cuh" + +template +__global__ void kernel_pack(Torus *array_out, Torus *array_in, + uint32_t log_modulus, uint32_t in_len, + uint32_t len) { + auto nbits = sizeof(Torus) * 8; + + auto i = threadIdx.x + blockIdx.x * blockDim.x; + if (i < len) { + auto k = nbits * i / log_modulus; + auto j = k; + + auto start_shift = i * nbits - j * log_modulus; + + auto value = array_in[j] >> start_shift; + j++; + + while (j * log_modulus < ((i + 1) * nbits) && j < in_len) { + auto shift = j * log_modulus - i * nbits; + value |= array_in[j] << shift; + j++; + } + + array_out[i] = value; + } +} + +template +__host__ void pack(cudaStream_t stream, Torus *array_out, Torus *array_in, + uint32_t num_inputs, uint32_t body_count, + int_compression *mem_ptr) { + + auto params = mem_ptr->compression_params; + + auto log_modulus = mem_ptr->storage_log_modulus; + auto in_len = params.glwe_dimension * params.polynomial_size + body_count; + auto number_bits_to_pack = in_len * log_modulus; + + auto nbits = sizeof(Torus) * 8; + // number_bits_to_pack.div_ceil(Scalar::BITS) + auto len = (number_bits_to_pack + nbits - 1) / nbits; + + int num_blocks = 0, num_threads = 0; + getNumBlocksAndThreads(len, 1024, num_blocks, num_threads); + + dim3 grid(num_blocks); + dim3 threads(num_threads); + kernel_pack<<>>(array_out, array_in, log_modulus, + in_len, len); +} + +template +__host__ void host_integer_compression_compress( + cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, + Torus *glwe_array_out, Torus *lwe_array_in, Torus **fp_ksk, + uint32_t num_lwes, int_compression *mem_ptr) { + + auto compression_params = mem_ptr->compression_params; + auto input_lwe_dimension = 
compression_params.small_lwe_dimension; + + // Shift + auto lwe_shifted = mem_ptr->tmp_lwe; + host_cleartext_multiplication(streams[0], gpu_indexes[0], lwe_shifted, + lwe_array_in, + (uint64_t)compression_params.message_modulus, + input_lwe_dimension, num_lwes); + + uint32_t lwe_in_size = input_lwe_dimension + 1; + uint32_t glwe_out_size = (compression_params.glwe_dimension + 1) * + compression_params.polynomial_size; + uint32_t num_glwes = num_lwes / mem_ptr->lwe_per_glwe + 1; + + // Keyswitch LWEs to GLWE + auto tmp_glwe_array_out = mem_ptr->tmp_glwe_array_out; + for (int i = 0; i < num_glwes; i++) { + auto lwe_subset = lwe_shifted + i * lwe_in_size; + auto glwe_out = tmp_glwe_array_out + i * glwe_out_size; + + host_fp_keyswitch_lwe_list_to_glwe( + streams[0], gpu_indexes[0], glwe_out, lwe_subset, fp_ksk[0], + input_lwe_dimension, compression_params.glwe_dimension, + compression_params.polynomial_size, compression_params.ks_base_log, + compression_params.ks_level, min(num_lwes, mem_ptr->lwe_per_glwe)); + } + + auto body_count = min(num_lwes, mem_ptr->lwe_per_glwe); + + // Modulus switch + modulus_switch_inplace(streams[0], gpu_indexes[0], tmp_glwe_array_out, + num_glwes * (compression_params.glwe_dimension * + compression_params.polynomial_size + + body_count), + mem_ptr->storage_log_modulus); + check_cuda_error(cudaGetLastError()); + + pack(streams[0], glwe_array_out, tmp_glwe_array_out, num_glwes, body_count, + mem_ptr); +} + +template +__global__ void kernel_extract(Torus *glwe_array_out, Torus *array_in, + uint32_t index, uint32_t log_modulus, + uint32_t initial_out_len) { + auto nbits = sizeof(Torus) * 8; + + auto i = threadIdx.x + blockIdx.x * blockDim.x; + + if (i < initial_out_len) { + // Unpack + Torus mask = ((Torus)1 << log_modulus) - 1; + auto start = i * log_modulus; + auto end = (i + 1) * log_modulus; + + auto start_block = start / nbits; + auto start_remainder = start % nbits; + + auto end_block_inclusive = (end - 1) / nbits; + + Torus unpacked_i; + if (start_block == end_block_inclusive) { + auto single_part = array_in[start_block] >> start_remainder; + unpacked_i = single_part & mask; + } else { + auto first_part = array_in[start_block] >> start_remainder; + auto second_part = array_in[start_block + 1] << (nbits - start_remainder); + + unpacked_i = (first_part | second_part) & mask; + } + + // Extract + glwe_array_out[i] = unpacked_i << (nbits - log_modulus); + } +} + +template +__host__ void extract(cudaStream_t stream, uint32_t gpu_index, + Torus *glwe_array_out, Torus *array_in, + uint32_t glwe_index, int_compression *mem_ptr) { + + auto params = mem_ptr->compression_params; + + auto log_modulus = mem_ptr->storage_log_modulus; + + uint32_t body_count = mem_ptr->body_count; + auto initial_out_len = + params.glwe_dimension * params.polynomial_size + body_count * body_count; + + // We assure the tail of the glwe is zeroed + auto zeroed_slice = glwe_array_out + initial_out_len; + cuda_memset_async(zeroed_slice, 0, + (params.polynomial_size - body_count * body_count) * + sizeof(Torus), + stream, gpu_index); + + int num_blocks = 0, num_threads = 0; + getNumBlocksAndThreads(initial_out_len, 512, num_blocks, num_threads); + dim3 grid(num_blocks); + dim3 threads(num_threads); + kernel_extract<<>>( + glwe_array_out, array_in, glwe_index, log_modulus, initial_out_len); + check_cuda_error(cudaGetLastError()); +} + +template +__host__ void host_integer_compression_decompress( + cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, + Torus *lwe_array_out, Torus 
*packed_glwe_in, uint32_t *indexes_array, + uint32_t indexes_array_size, void **bsks, int_compression *mem_ptr) { + + auto extracted_glwe = mem_ptr->tmp_extracted_glwe; + auto compression_params = mem_ptr->compression_params; + extract(streams[0], gpu_indexes[0], extracted_glwe, packed_glwe_in, 0, + mem_ptr); + + auto num_lwes = mem_ptr->body_count; + + // Sample extract + auto extracted_lwe = mem_ptr->tmp_extracted_lwe; + cuda_glwe_sample_extract_64(streams[0], gpu_indexes[0], extracted_lwe, + extracted_glwe, indexes_array, indexes_array_size, + compression_params.glwe_dimension, + compression_params.polynomial_size); + + /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE + /// dimension to a big LWE dimension + auto encryption_params = mem_ptr->encryption_params; + auto carry_extract_lut = mem_ptr->carry_extract_lut; + execute_pbs_async( + streams, gpu_indexes, gpu_count, lwe_array_out, + carry_extract_lut->lwe_indexes_out, carry_extract_lut->lut_vec, + carry_extract_lut->lut_indexes_vec, extracted_lwe, + carry_extract_lut->lwe_indexes_in, bsks, carry_extract_lut->buffer, + encryption_params.glwe_dimension, + compression_params.glwe_dimension * compression_params.polynomial_size, + encryption_params.polynomial_size, encryption_params.pbs_base_log, + encryption_params.pbs_level, encryption_params.grouping_factor, num_lwes, + encryption_params.pbs_type); + + cudaDeviceSynchronize(); +} + +template +__host__ void scratch_cuda_compression_integer_radix_ciphertext_64( + cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, + int_compression **mem_ptr, uint32_t num_lwes, + int_radix_params encryption_params, int_radix_params compression_params, + uint32_t lwe_per_glwe, uint32_t storage_log_modulus, COMPRESSION_MODE mode, + bool allocate_gpu_memory) { + + *mem_ptr = new int_compression( + streams, gpu_indexes, gpu_count, encryption_params, compression_params, + num_lwes, lwe_per_glwe, storage_log_modulus, mode, allocate_gpu_memory); +} +#endif diff --git a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cu b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cu index 2a936d7cdf..a64c15378d 100644 --- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cu +++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cu @@ -9,12 +9,12 @@ void cuda_mult_lwe_ciphertext_vector_cleartext_vector_32( void *cleartext_array_in, uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count) { - host_cleartext_multiplication(static_cast(stream), gpu_index, - static_cast(lwe_array_out), - static_cast(lwe_array_in), - static_cast(cleartext_array_in), - input_lwe_dimension, - input_lwe_ciphertext_count); + host_cleartext_vec_multiplication( + static_cast(stream), gpu_index, + static_cast(lwe_array_out), + static_cast(lwe_array_in), + static_cast(cleartext_array_in), input_lwe_dimension, + input_lwe_ciphertext_count); } /* * Perform the multiplication of a u64 input LWE ciphertext vector with a u64 @@ -49,10 +49,10 @@ void cuda_mult_lwe_ciphertext_vector_cleartext_vector_64( void *cleartext_array_in, uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count) { - host_cleartext_multiplication(static_cast(stream), gpu_index, - static_cast(lwe_array_out), - static_cast(lwe_array_in), - static_cast(cleartext_array_in), - input_lwe_dimension, - input_lwe_ciphertext_count); + host_cleartext_vec_multiplication( + static_cast(stream), gpu_index, + static_cast(lwe_array_out), + static_cast(lwe_array_in), + 
static_cast(cleartext_array_in), input_lwe_dimension, + input_lwe_ciphertext_count); } diff --git a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cuh b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cuh index 9fd9be9419..b9864973a0 100644 --- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cuh @@ -14,9 +14,10 @@ #include template -__global__ void -cleartext_multiplication(T *output, T *lwe_input, T *cleartext_input, - uint32_t input_lwe_dimension, uint32_t num_entries) { +__global__ void cleartext_vec_multiplication(T *output, T *lwe_input, + T *cleartext_input, + uint32_t input_lwe_dimension, + uint32_t num_entries) { int tid = threadIdx.x; int index = blockIdx.x * blockDim.x + tid; @@ -27,10 +28,46 @@ cleartext_multiplication(T *output, T *lwe_input, T *cleartext_input, } } +template +__host__ void +host_cleartext_vec_multiplication(cudaStream_t stream, uint32_t gpu_index, + T *output, T *lwe_input, T *cleartext_input, + uint32_t input_lwe_dimension, + uint32_t input_lwe_ciphertext_count) { + + cudaSetDevice(gpu_index); + // lwe_size includes the presence of the body + // whereas lwe_dimension is the number of elements in the mask + int lwe_size = input_lwe_dimension + 1; + // Create a 1-dimensional grid of threads + int num_blocks = 0, num_threads = 0; + int num_entries = input_lwe_ciphertext_count * lwe_size; + getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads); + dim3 grid(num_blocks, 1, 1); + dim3 thds(num_threads, 1, 1); + + cleartext_vec_multiplication<<>>( + output, lwe_input, cleartext_input, input_lwe_dimension, num_entries); + check_cuda_error(cudaGetLastError()); +} + +template +__global__ void +cleartext_multiplication(T *output, T *lwe_input, T cleartext_input, + uint32_t input_lwe_dimension, uint32_t num_entries) { + + int tid = threadIdx.x; + int index = blockIdx.x * blockDim.x + tid; + if (index < num_entries) { + // Here we take advantage of the wrapping behaviour of uint + output[index] = lwe_input[index] * cleartext_input; + } +} + template __host__ void host_cleartext_multiplication(cudaStream_t stream, uint32_t gpu_index, - T *output, T *lwe_input, T *cleartext_input, + T *output, T *lwe_input, T cleartext_input, uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count) { diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh index 6a60a0f6d1..ea95197ec9 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh @@ -88,8 +88,8 @@ __global__ void device_programmable_bootstrap_amortized( // Put "b", the body, in [0, 2N[ Torus b_hat = 0; - modulus_switch(block_lwe_array_in[lwe_dimension], b_hat, - params::log2_degree + 1); + apply_modulus_switch(block_lwe_array_in[lwe_dimension], b_hat, + params::log2_degree + 1); divide_by_monomial_negacyclic_inplace( @@ -104,8 +104,8 @@ __global__ void device_programmable_bootstrap_amortized( // Put "a" in [0, 2N[ instead of Zq Torus a_hat = 0; - modulus_switch(block_lwe_array_in[iteration], a_hat, - params::log2_degree + 1); + apply_modulus_switch(block_lwe_array_in[iteration], a_hat, + params::log2_degree + 1); // Perform ACC * (X^ä - 1) multiply_by_monomial_negacyclic_and_sub_polynomial< diff --git 
a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh index 94d383a2f3..9d0c50c78a 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh @@ -93,8 +93,8 @@ __global__ void device_programmable_bootstrap_cg( // Put "b" in [0, 2N[ Torus b_hat = 0; - modulus_switch(block_lwe_array_in[lwe_dimension], b_hat, - params::log2_degree + 1); + apply_modulus_switch(block_lwe_array_in[lwe_dimension], b_hat, + params::log2_degree + 1); divide_by_monomial_negacyclic_inplace( @@ -106,7 +106,7 @@ __global__ void device_programmable_bootstrap_cg( // Put "a" in [0, 2N[ Torus a_hat = 0; - modulus_switch(block_lwe_array_in[i], a_hat, params::log2_degree + 1); + apply_modulus_switch(block_lwe_array_in[i], a_hat, params::log2_degree + 1); // Perform ACC * (X^ä - 1) multiply_by_monomial_negacyclic_and_sub_polynomial< diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh index a40fdf8b9d..daf43bb097 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh @@ -80,8 +80,8 @@ __global__ void __launch_bounds__(params::degree / params::opt) if (lwe_offset == 0) { // Put "b" in [0, 2N[ Torus b_hat = 0; - modulus_switch(block_lwe_array_in[lwe_dimension], b_hat, - params::log2_degree + 1); + apply_modulus_switch(block_lwe_array_in[lwe_dimension], b_hat, + params::log2_degree + 1); divide_by_monomial_negacyclic_inplace( diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh index 47d6955cb1..64a73b3afc 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh @@ -75,8 +75,8 @@ __global__ void __launch_bounds__(params::degree / params::opt) // First iteration // Put "b" in [0, 2N[ Torus b_hat = 0; - modulus_switch(block_lwe_array_in[lwe_dimension], b_hat, - params::log2_degree + 1); + apply_modulus_switch(block_lwe_array_in[lwe_dimension], b_hat, + params::log2_degree + 1); // The y-dimension is used to select the element of the GLWE this block will // compute divide_by_monomial_negacyclic_inplace @@ -102,7 +102,7 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle( synchronize_threads_in_block(); // Multiply by the bsk element - polynomial_product_accumulate_by_monomial( + polynomial_accumulate_monic_monomial_mul( accumulator, bsk_poly, monomial_degree, false); } @@ -203,8 +203,8 @@ __global__ void __launch_bounds__(params::degree / params::opt) // Initializes the accumulator with the body of LWE // Put "b" in [0, 2N[ Torus b_hat = 0; - modulus_switch(block_lwe_array_in[lwe_dimension], b_hat, - params::log2_degree + 1); + apply_modulus_switch(block_lwe_array_in[lwe_dimension], b_hat, + params::log2_degree + 1); divide_by_monomial_negacyclic_inplace( diff --git a/backends/tfhe-cuda-backend/cuda/src/polynomial/polynomial_math.cuh b/backends/tfhe-cuda-backend/cuda/src/polynomial/polynomial_math.cuh index a6d7901f20..9654e66b68 100644 --- a/backends/tfhe-cuda-backend/cuda/src/polynomial/polynomial_math.cuh +++ 
b/backends/tfhe-cuda-backend/cuda/src/polynomial/polynomial_math.cuh @@ -55,14 +55,15 @@ __device__ void polynomial_product_accumulate_in_fourier_domain( } } -// If init_accumulator is set, assumes that result was not initialized and does -// that with the outcome of first * second +// This method expects to work with polynomial_size / compression_params::opt +// threads in the x-block If init_accumulator is set, assumes that result was +// not initialized and does that with the outcome of first * second template __device__ void -polynomial_product_accumulate_by_monomial(T *result, const T *__restrict__ poly, - uint64_t monomial_degree, - bool init_accumulator = false) { - // monomial_degree \in [0, 2 * params::degree) +polynomial_accumulate_monic_monomial_mul(T *result, const T *__restrict__ poly, + uint64_t monomial_degree, + bool init_accumulator = false) { + // monomial_degree \in [0, 2 * compression_params::degree) int full_cycles_count = monomial_degree / params::degree; int remainder_degrees = monomial_degree % params::degree; @@ -82,4 +83,37 @@ polynomial_product_accumulate_by_monomial(T *result, const T *__restrict__ poly, } } +// This method expects to work with num_poly * polynomial_size threads in the +// grid +template +__device__ void polynomial_accumulate_monic_monomial_mul( + T *result_array, T *poly_array, uint64_t monomial_degree, + uint32_t polynomial_size, + bool init_accumulator = false) { + // monomial_degree \in [0, 2 * compression_params::degree) + int full_cycles_count = monomial_degree / polynomial_size; + int remainder_degrees = monomial_degree % polynomial_size; + + auto tid = threadIdx.x + blockIdx.x * blockDim.x; + int pos = tid % polynomial_size; + + // Select a input + auto poly = poly_array + (tid / polynomial_size) * polynomial_size; + auto result = result_array + (tid / polynomial_size) * polynomial_size; + + // Calculate the rotation + T element = poly[pos]; + int new_pos = (pos + monomial_degree) % polynomial_size; + + // Calculate the new coefficient + T x = SEL(element, -element, full_cycles_count % 2); // monomial coefficient + x = SEL(-x, x, new_pos >= remainder_degrees); + + // Write result + if (init_accumulator) + result[new_pos] = x; + else + result[new_pos] += x; +} + #endif // CNCRT_POLYNOMIAL_MATH_H diff --git a/backends/tfhe-cuda-backend/src/cuda_bind.rs b/backends/tfhe-cuda-backend/src/cuda_bind.rs index 70f9b2a607..2d2a609e09 100644 --- a/backends/tfhe-cuda-backend/src/cuda_bind.rs +++ b/backends/tfhe-cuda-backend/src/cuda_bind.rs @@ -310,6 +310,19 @@ extern "C" { level_count: u32, num_samples: u32, ); + pub fn cuda_fp_keyswitch_lwe_list_to_glwe_64( + stream: *mut c_void, + gpu_index: u32, + glwe_array_out: *mut c_void, + lwe_array_in: *const c_void, + fp_ksk_array: *const c_void, + input_lwe_dimension: u32, + output_glwe_dimension: u32, + polynomial_size: u32, + base_log: u32, + level_count: u32, + num_lwes: u32, + ); /// Perform the negation of a u64 input LWE ciphertext vector. 
/// - `v_stream` is a void pointer to the Cuda stream to be used in the kernel launch @@ -484,6 +497,61 @@ extern "C" { mem_ptr: *mut *mut i8, ); + pub fn scratch_cuda_compression_integer_radix_ciphertext_64( + streams: *const *mut c_void, + gpu_indexes: *const u32, + gpu_count: u32, + mem_ptr: *mut *mut i8, + encryption_glwe_dimension: u32, + encryption_polynomial_size: u32, + compression_glwe_dimension: u32, + compression_polynomial_size: u32, + lwe_dimension: u32, + ks_level: u32, + ks_base_log: u32, + pbs_level: u32, + pbs_base_log: u32, + grouping_factor: u32, + num_lwes: u32, + message_modulus: u32, + carry_modulus: u32, + pbs_type: u32, + lwe_per_glwe: u32, + storage_log_modulus: u32, + mode: u32, + allocate_gpu_memory: bool, + ); + + pub fn cuda_compression_compress_integer_radix_ciphertext_64( + streams: *const *mut c_void, + gpu_indexes: *const u32, + gpu_count: u32, + glwe_array_out: *mut c_void, + lwe_array_in: *const c_void, + fp_ksk: *const *mut c_void, + num_lwes: u32, + mem_ptr: *mut i8, + ); + + pub fn cuda_compression_decompress_integer_radix_ciphertext_64( + streams: *const *mut c_void, + gpu_indexes: *const u32, + gpu_count: u32, + lwe_out: *mut c_void, + glwe_array_in: *const c_void, + indexes_array: *const c_void, + indexes_array_size: u32, + bsks: *const *mut c_void, + mem_ptr: *mut i8, + ); + + pub fn cleanup_cuda_compression_integer_radix_ciphertext_64( + streams: *const *mut c_void, + gpu_indexes: *const u32, + gpu_count: u32, + mem_ptr: *mut *mut i8, + ); + pub fn cuda_scalar_addition_integer_radix_ciphertext_64_inplace( streams: *const *mut c_void, gpu_indexes: *const u32, diff --git a/tfhe/src/core_crypto/gpu/algorithms/lwe_packing_keyswitch.rs b/tfhe/src/core_crypto/gpu/algorithms/lwe_packing_keyswitch.rs new file mode 100644 index 0000000000..3fe1180952 --- /dev/null +++ b/tfhe/src/core_crypto/gpu/algorithms/lwe_packing_keyswitch.rs @@ -0,0 +1,34 @@ +use crate::core_crypto::gpu::glwe_ciphertext_list::CudaGlweCiphertextList; +use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList; +use crate::core_crypto::gpu::lwe_packing_keyswitch_key::CudaLwePackingKeyswitchKey; +use crate::core_crypto::gpu::{packing_keyswitch_list_async, CudaStreams}; +use crate::core_crypto::prelude::{CastInto, UnsignedTorus}; + +pub fn cuda_keyswitch_lwe_ciphertext_list_into_glwe_ciphertext( + lwe_pksk: &CudaLwePackingKeyswitchKey, + input_lwe_ciphertext_list: &CudaLweCiphertextList, + output_glwe_ciphertext: &mut CudaGlweCiphertextList, + streams: &CudaStreams, +) where + // CastInto required for PBS modulus switch which returns a usize + Scalar: UnsignedTorus + CastInto, +{ + let input_lwe_dimension = input_lwe_ciphertext_list.lwe_dimension(); + let output_glwe_dimension = output_glwe_ciphertext.glwe_dimension(); + let output_polynomial_size = output_glwe_ciphertext.polynomial_size(); + + unsafe { + packing_keyswitch_list_async( + streams, + &mut output_glwe_ciphertext.0.d_vec, + &input_lwe_ciphertext_list.0.d_vec, + input_lwe_dimension, + output_glwe_dimension, + output_polynomial_size, + &lwe_pksk.d_vec, + lwe_pksk.decomposition_base_log(), + lwe_pksk.decomposition_level_count(), + input_lwe_ciphertext_list.lwe_ciphertext_count(), + ); + } +} diff --git a/tfhe/src/core_crypto/gpu/algorithms/mod.rs b/tfhe/src/core_crypto/gpu/algorithms/mod.rs index ee6ddd2b1d..489bbb1565 100644 --- a/tfhe/src/core_crypto/gpu/algorithms/mod.rs +++ b/tfhe/src/core_crypto/gpu/algorithms/mod.rs @@ -1,13 +1,15 @@ +pub mod glwe_sample_extraction; +pub mod lwe_keyswitch; pub mod 
lwe_linear_algebra; pub mod lwe_multi_bit_programmable_bootstrapping; +pub mod lwe_packing_keyswitch; pub mod lwe_programmable_bootstrapping; -pub mod glwe_sample_extraction; -mod lwe_keyswitch; #[cfg(test)] mod test; pub use lwe_keyswitch::*; pub use lwe_linear_algebra::*; pub use lwe_multi_bit_programmable_bootstrapping::*; +pub use lwe_packing_keyswitch::*; pub use lwe_programmable_bootstrapping::*; diff --git a/tfhe/src/core_crypto/gpu/algorithms/test/lwe_packing_keyswitch.rs b/tfhe/src/core_crypto/gpu/algorithms/test/lwe_packing_keyswitch.rs new file mode 100644 index 0000000000..0c782862c2 --- /dev/null +++ b/tfhe/src/core_crypto/gpu/algorithms/test/lwe_packing_keyswitch.rs @@ -0,0 +1,227 @@ +use super::*; +use crate::core_crypto::gpu::algorithms::lwe_packing_keyswitch::cuda_keyswitch_lwe_ciphertext_list_into_glwe_ciphertext; +use crate::core_crypto::gpu::glwe_ciphertext_list::CudaGlweCiphertextList; +use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList; +use crate::core_crypto::gpu::CudaStreams; +use serde::de::DeserializeOwned; +use serde::Serialize; + +const NB_TESTS: usize = 10; +fn generate_keys( + params: PackingKeySwitchTestParams, + streams: &CudaStreams, + rsc: &mut TestResources, +) -> CudaPackingKeySwitchKeys { + let lwe_sk = allocate_and_generate_new_binary_lwe_secret_key( + params.lwe_dimension, + &mut rsc.secret_random_generator, + ); + + let glwe_sk = allocate_and_generate_new_binary_glwe_secret_key( + params.glwe_dimension, + params.polynomial_size, + &mut rsc.secret_random_generator, + ); + + let pksk = allocate_and_generate_new_lwe_packing_keyswitch_key( + &lwe_sk, + &glwe_sk, + params.pbs_base_log, + params.pbs_level, + params.glwe_noise_distribution, + params.ciphertext_modulus, + &mut rsc.encryption_random_generator, + ); + + assert!(check_encrypted_content_respects_mod( + &pksk, + params.ciphertext_modulus + )); + + let cuda_pksk = CudaLwePackingKeyswitchKey::from_lwe_packing_keyswitch_key(&pksk, streams); + + CudaPackingKeySwitchKeys { + lwe_sk, + glwe_sk, + pksk: cuda_pksk, + } +} + +fn lwe_encrypt_pks_to_glwe_decrypt_custom_mod(params: P) +where + Scalar: UnsignedTorus + CastInto + Serialize + DeserializeOwned, + P: Into>, + PackingKeySwitchTestParams: KeyCacheAccess>, +{ + let params = params.into(); + + let lwe_noise_distribution = params.lwe_noise_distribution; + let ciphertext_modulus = params.ciphertext_modulus; + let message_modulus_log = params.message_modulus_log; + let encoding_with_padding = get_encoding_with_padding(ciphertext_modulus); + + let mut rsc = TestResources::new(); + + let msg_modulus = Scalar::ONE.shl(message_modulus_log.0); + let mut msg = msg_modulus; + let delta: Scalar = encoding_with_padding / msg_modulus; + + let gpu_index = 0; + let stream = CudaStreams::new_single_gpu(gpu_index); + + while msg != Scalar::ZERO { + msg = msg.wrapping_sub(Scalar::ONE); + for _ in 0..NB_TESTS { + let keys = generate_keys(params, &stream, &mut rsc); + let (pksk, lwe_sk, glwe_sk) = (keys.pksk, keys.lwe_sk, keys.glwe_sk); + + let plaintext = Plaintext(msg * delta); + + let input_lwe = allocate_and_encrypt_new_lwe_ciphertext( + &lwe_sk, + plaintext, + lwe_noise_distribution, + ciphertext_modulus, + &mut rsc.encryption_random_generator, + ); + + let d_input_lwe = CudaLweCiphertextList::from_lwe_ciphertext(&input_lwe, &stream); + + assert!(check_encrypted_content_respects_mod( + &input_lwe, + ciphertext_modulus + )); + + let mut d_output_glwe = CudaGlweCiphertextList::new( + glwe_sk.glwe_dimension(), + glwe_sk.polynomial_size(), + 
GlweCiphertextCount(1), + ciphertext_modulus, + &stream, + ); + + cuda_keyswitch_lwe_ciphertext_list_into_glwe_ciphertext( + &pksk, + &d_input_lwe, + &mut d_output_glwe, + &stream, + ); + + let output_glwe_list = d_output_glwe.to_glwe_ciphertext_list(&stream); + let mut decrypted_plaintext_list = PlaintextList::new( + Scalar::ZERO, + PlaintextCount(output_glwe_list.polynomial_size().0), + ); + + decrypt_glwe_ciphertext_list( + &glwe_sk, + &output_glwe_list, + &mut decrypted_plaintext_list, + ); + let decoded = round_decode(*decrypted_plaintext_list.get(0).0, delta) % msg_modulus; + + assert_eq!(msg, decoded); + } + + // In coverage, we break after one while loop iteration, changing message values does not + // yield higher coverage + #[cfg(tarpaulin)] + break; + } +} + +fn lwe_list_encrypt_pks_to_glwe_decrypt_custom_mod(params: P) +where + Scalar: UnsignedTorus + CastInto + Serialize + DeserializeOwned, + P: Into>, + PackingKeySwitchTestParams: KeyCacheAccess>, +{ + let params = params.into(); + + let lwe_noise_distribution = params.lwe_noise_distribution; + let ciphertext_modulus = params.ciphertext_modulus; + let message_modulus_log = params.message_modulus_log; + let encoding_with_padding = get_encoding_with_padding(ciphertext_modulus); + + let mut rsc = TestResources::new(); + + let msg_modulus = Scalar::ONE.shl(message_modulus_log.0); + let mut msg = msg_modulus; + let delta: Scalar = encoding_with_padding / msg_modulus; + + let gpu_index = 0; + let stream = CudaStreams::new_single_gpu(gpu_index); + + // while msg != Scalar::ZERO { + msg = msg.wrapping_sub(Scalar::ONE); + for _ in 0..NB_TESTS { + let keys = generate_keys(params, &stream, &mut rsc); + let (pksk, lwe_sk, glwe_sk) = (keys.pksk, keys.lwe_sk, keys.glwe_sk); + + let mut input_lwe_list = LweCiphertextList::new( + Scalar::ZERO, + lwe_sk.lwe_dimension().to_lwe_size(), + LweCiphertextCount(glwe_sk.polynomial_size().0), + ciphertext_modulus, + ); + + let mut input_plaintext_list = + PlaintextList::new(msg * delta, PlaintextCount(glwe_sk.polynomial_size().0)); + + encrypt_lwe_ciphertext_list( + &lwe_sk, + &mut input_lwe_list, + &input_plaintext_list, + lwe_noise_distribution, + &mut rsc.encryption_random_generator, + ); + + let d_input_lwe_list = + CudaLweCiphertextList::from_lwe_ciphertext_list(&input_lwe_list, &stream); + + assert!(check_encrypted_content_respects_mod( + &input_lwe_list, + ciphertext_modulus + )); + + let mut d_output_glwe = CudaGlweCiphertextList::new( + glwe_sk.glwe_dimension(), + glwe_sk.polynomial_size(), + GlweCiphertextCount(1), + ciphertext_modulus, + &stream, + ); + + cuda_keyswitch_lwe_ciphertext_list_into_glwe_ciphertext( + &pksk, + &d_input_lwe_list, + &mut d_output_glwe, + &stream, + ); + + let output_glwe_list = d_output_glwe.to_glwe_ciphertext_list(&stream); + + let mut decrypted_plaintext_list = PlaintextList::new( + Scalar::ZERO, + PlaintextCount(output_glwe_list.polynomial_size().0), + ); + + decrypt_glwe_ciphertext_list(&glwe_sk, &output_glwe_list, &mut decrypted_plaintext_list); + + decrypted_plaintext_list + .iter_mut() + .for_each(|x| *x.0 = round_decode(*x.0, delta) % msg_modulus); + input_plaintext_list.iter_mut().for_each(|x| *x.0 /= delta); + + assert_eq!(decrypted_plaintext_list, input_plaintext_list); + } + + // In coverage, we break after one while loop iteration, changing message values does not + // yield higher coverage + #[cfg(tarpaulin)] + break; + // } +} + +create_gpu_parametrized_test!(lwe_encrypt_pks_to_glwe_decrypt_custom_mod); 
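Editor's note: the safe wrapper exercised by these tests forwards to the asynchronous binding added in core_crypto/gpu/mod.rs; a hedged sketch of calling it directly is given below, mainly to make the synchronization contract explicit. The helper name and the assumption that the CudaVec buffers are already populated are illustrative; the function name, argument order and types come from this patch.

use crate::core_crypto::gpu::vec::CudaVec;
use crate::core_crypto::gpu::{packing_keyswitch_list_async, CudaStreams};
use crate::core_crypto::prelude::{
    DecompositionBaseLog, DecompositionLevelCount, GlweDimension, LweCiphertextCount,
    LweDimension, PolynomialSize,
};

/// Illustrative sketch only: run the raw packing keyswitch asynchronously and
/// synchronize before the output GLWE buffer is read back.
fn packing_keyswitch_example(
    streams: &CudaStreams,
    d_glwe_out: &mut CudaVec<u64>,
    d_lwe_in: &CudaVec<u64>,
    d_pksk: &CudaVec<u64>,
    input_lwe_dimension: LweDimension,
    output_glwe_dimension: GlweDimension,
    output_polynomial_size: PolynomialSize,
    base_log: DecompositionBaseLog,
    level_count: DecompositionLevelCount,
    num_lwes: LweCiphertextCount,
) {
    unsafe {
        packing_keyswitch_list_async(
            streams,
            d_glwe_out,
            d_lwe_in,
            input_lwe_dimension,
            output_glwe_dimension,
            output_polynomial_size,
            d_pksk,
            base_log,
            level_count,
            num_lwes,
        );
    }
    // The call is asynchronous; the result is only valid after synchronization.
    streams.synchronize();
}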
+create_gpu_parametrized_test!(lwe_list_encrypt_pks_to_glwe_decrypt_custom_mod); diff --git a/tfhe/src/core_crypto/gpu/algorithms/test/mod.rs b/tfhe/src/core_crypto/gpu/algorithms/test/mod.rs index 59ba20af18..779a419702 100644 --- a/tfhe/src/core_crypto/gpu/algorithms/test/mod.rs +++ b/tfhe/src/core_crypto/gpu/algorithms/test/mod.rs @@ -4,8 +4,15 @@ mod glwe_sample_extraction; mod lwe_keyswitch; mod lwe_linear_algebra; mod lwe_multi_bit_programmable_bootstrapping; +mod lwe_packing_keyswitch; mod lwe_programmable_bootstrapping; +pub struct CudaPackingKeySwitchKeys { + pub lwe_sk: LweSecretKey>, + pub glwe_sk: GlweSecretKey>, + pub pksk: CudaLwePackingKeyswitchKey, +} + // Macro to generate tests for all parameter sets macro_rules! create_gpu_parametrized_test{ ($name:ident { $($param:ident),* }) => { @@ -47,4 +54,5 @@ macro_rules! create_gpu_multi_bit_parametrized_test{ }; } +use crate::core_crypto::gpu::lwe_packing_keyswitch_key::CudaLwePackingKeyswitchKey; use {create_gpu_multi_bit_parametrized_test, create_gpu_parametrized_test}; diff --git a/tfhe/src/core_crypto/gpu/entities/lwe_ciphertext_list.rs b/tfhe/src/core_crypto/gpu/entities/lwe_ciphertext_list.rs index 9defa579f6..1fde0545bf 100644 --- a/tfhe/src/core_crypto/gpu/entities/lwe_ciphertext_list.rs +++ b/tfhe/src/core_crypto/gpu/entities/lwe_ciphertext_list.rs @@ -4,6 +4,7 @@ use crate::core_crypto::prelude::{ CiphertextModulus, Container, LweCiphertext, LweCiphertextCount, LweCiphertextList, LweDimension, LweSize, UnsignedInteger, }; +use std::cmp::min; use tfhe_cuda_backend::cuda_bind::cuda_memcpy_async_gpu_to_gpu; /// A structure representing a vector of LWE ciphertexts with 64 bits of precision on the GPU. @@ -199,49 +200,60 @@ impl CudaLweCiphertextList { LweCiphertext::from_container(container, self.ciphertext_modulus()) } - /// ```rust - /// use tfhe::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList; - /// use tfhe::core_crypto::gpu::CudaStreams; - /// use tfhe::core_crypto::prelude::{ - /// CiphertextModulus, LweCiphertextCount, LweCiphertextList, LweSize, - /// }; - /// - /// let mut streams = CudaStreams::new_single_gpu(0); - /// - /// let lwe_size = LweSize(743); - /// let ciphertext_modulus = CiphertextModulus::new_native(); - /// let lwe_ciphertext_count = LweCiphertextCount(2); - /// - /// // Create a new LweCiphertextList - /// let lwe_list = LweCiphertextList::new(0u64, lwe_size, lwe_ciphertext_count, ciphertext_modulus); - /// - /// // Copy to GPU - /// let d_lwe_list = CudaLweCiphertextList::from_lwe_ciphertext_list(&lwe_list, &mut streams); - /// let d_lwe_list_copied = d_lwe_list.duplicate(&mut streams); - /// - /// let lwe_list_copied = d_lwe_list_copied.to_lwe_ciphertext_list(&mut streams); - /// - /// assert_eq!(lwe_list, lwe_list_copied); - /// ``` - pub fn duplicate(&self, streams: &CudaStreams) -> Self { - let lwe_dimension = self.lwe_dimension(); - let lwe_ciphertext_count = self.lwe_ciphertext_count(); - let ciphertext_modulus = self.ciphertext_modulus(); + // Retrieve [start,end) lwe ciphertexts in the list + pub fn retrieve_subset( + &self, + start: usize, + end: usize, + streams: &CudaStreams, + gpu_index: u32, + ) -> Option { + if end < start { + None + } else { + let lwe_dimension = self.lwe_dimension(); + let lwe_ciphertext_count = + LweCiphertextCount(min(end - start, self.lwe_ciphertext_count().0)); + let ciphertext_modulus = self.ciphertext_modulus(); - // Copy to the GPU - let mut d_vec = CudaVec::new(self.0.d_vec.len(), streams, 0); - unsafe { - 
d_vec.copy_from_gpu_async(&self.0.d_vec, streams, 0); - } - streams.synchronize(); + // Copy to the GPU + let d_vec = unsafe { + let mut d_vec = CudaVec::new_async( + lwe_ciphertext_count.0 * lwe_dimension.to_lwe_size().0, + streams, + gpu_index, + ); + // Todo: We might use copy_src_range_gpu_to_gpu_async here + let src_ptr = self + .0 + .d_vec + .as_c_ptr(gpu_index) + .add(start * lwe_dimension.to_lwe_size().0 * std::mem::size_of::()); + let size = lwe_ciphertext_count.0 + * lwe_dimension.to_lwe_size().0 + * std::mem::size_of::(); + cuda_memcpy_async_gpu_to_gpu( + d_vec.as_mut_c_ptr(gpu_index), + src_ptr, + size as u64, + streams.ptr[gpu_index as usize], + streams.gpu_indexes[gpu_index as usize], + ); - let cuda_lwe_list = CudaLweList { - d_vec, - lwe_ciphertext_count, - lwe_dimension, - ciphertext_modulus, - }; - Self(cuda_lwe_list) + d_vec + }; + + streams.synchronize(); + + let cuda_lwe_list = CudaLweList { + d_vec, + lwe_ciphertext_count, + lwe_dimension, + ciphertext_modulus, + }; + + Some(Self(cuda_lwe_list)) + } } pub(crate) fn lwe_dimension(&self) -> LweDimension { diff --git a/tfhe/src/core_crypto/gpu/entities/lwe_packing_keyswitch_key.rs b/tfhe/src/core_crypto/gpu/entities/lwe_packing_keyswitch_key.rs new file mode 100644 index 0000000000..0c6664aa59 --- /dev/null +++ b/tfhe/src/core_crypto/gpu/entities/lwe_packing_keyswitch_key.rs @@ -0,0 +1,84 @@ +use crate::core_crypto::gpu::vec::CudaVec; +use crate::core_crypto::gpu::{convert_lwe_keyswitch_key_async, CudaStreams}; +use crate::core_crypto::prelude::{ + lwe_packing_keyswitch_key_input_key_element_encrypted_size, CiphertextModulus, + DecompositionBaseLog, DecompositionLevelCount, GlweSize, LweDimension, + LwePackingKeyswitchKeyOwned, PolynomialSize, UnsignedInteger, +}; + +#[derive(Debug)] +pub struct CudaLwePackingKeyswitchKey { + pub(crate) d_vec: CudaVec, + decomp_base_log: DecompositionBaseLog, + decomp_level_count: DecompositionLevelCount, + output_glwe_size: GlweSize, + output_polynomial_size: PolynomialSize, + ciphertext_modulus: CiphertextModulus, +} + +impl CudaLwePackingKeyswitchKey { + pub fn from_lwe_packing_keyswitch_key( + h_ksk: &LwePackingKeyswitchKeyOwned, + streams: &CudaStreams, + ) -> Self { + let decomp_base_log = h_ksk.decomposition_base_log(); + let decomp_level_count = h_ksk.decomposition_level_count(); + let input_lwe_size = h_ksk.input_key_lwe_dimension().to_lwe_size(); + let output_glwe_size = h_ksk.output_key_glwe_dimension().to_glwe_size(); + let output_polynomial_size = h_ksk.output_polynomial_size(); + let ciphertext_modulus = h_ksk.ciphertext_modulus(); + + // Allocate memory + let mut d_vec = CudaVec::::new_multi_gpu( + input_lwe_size.to_lwe_dimension().0 + * lwe_packing_keyswitch_key_input_key_element_encrypted_size( + decomp_level_count, + output_glwe_size, + output_polynomial_size, + ), + streams, + ); + + unsafe { + convert_lwe_keyswitch_key_async(streams, &mut d_vec, h_ksk.as_ref()); + } + + streams.synchronize(); + + Self { + d_vec, + decomp_base_log, + decomp_level_count, + output_glwe_size, + output_polynomial_size, + ciphertext_modulus, + } + } + + pub(crate) fn decomposition_base_log(&self) -> DecompositionBaseLog { + self.decomp_base_log + } + pub(crate) fn decomposition_level_count(&self) -> DecompositionLevelCount { + self.decomp_level_count + } + + pub(crate) fn output_glwe_size(&self) -> GlweSize { + self.output_glwe_size + } + pub(crate) fn ciphertext_modulus(&self) -> CiphertextModulus { + self.ciphertext_modulus + } + pub(crate) fn output_polynomial_size(&self) -> 
PolynomialSize { + self.output_polynomial_size + } + pub fn input_key_lwe_dimension(&self) -> LweDimension { + LweDimension( + self.d_vec.len + / lwe_packing_keyswitch_key_input_key_element_encrypted_size( + self.decomp_level_count, + self.output_glwe_size, + self.output_polynomial_size, + ), + ) + } +} diff --git a/tfhe/src/core_crypto/gpu/entities/mod.rs b/tfhe/src/core_crypto/gpu/entities/mod.rs index 34dcb5f03e..623bf42497 100644 --- a/tfhe/src/core_crypto/gpu/entities/mod.rs +++ b/tfhe/src/core_crypto/gpu/entities/mod.rs @@ -3,3 +3,4 @@ pub mod lwe_bootstrap_key; pub mod lwe_ciphertext_list; pub mod lwe_keyswitch_key; pub mod lwe_multi_bit_bootstrap_key; +pub mod lwe_packing_keyswitch_key; diff --git a/tfhe/src/core_crypto/gpu/mod.rs b/tfhe/src/core_crypto/gpu/mod.rs index 3341ba6e5e..40f9d11e0f 100644 --- a/tfhe/src/core_crypto/gpu/mod.rs +++ b/tfhe/src/core_crypto/gpu/mod.rs @@ -13,7 +13,6 @@ pub use algorithms::*; pub use entities::*; use std::ffi::c_void; pub(crate) use tfhe_cuda_backend::cuda_bind::*; - #[derive(Debug)] pub struct CudaStreams { pub ptr: Vec<*mut c_void>, @@ -260,6 +259,40 @@ pub unsafe fn convert_lwe_keyswitch_key_async( dest.copy_from_cpu_multi_gpu_async(src, streams); } +/// Discarding packing keyswitch on a vector of LWE ciphertexts +/// +/// # Safety +/// +/// [CudaStreams::synchronize] __must__ be called as soon as synchronization is +/// required +#[allow(clippy::too_many_arguments)] +pub unsafe fn packing_keyswitch_list_async( + streams: &CudaStreams, + glwe_array_out: &mut CudaVec, + lwe_array_in: &CudaVec, + input_lwe_dimension: LweDimension, + output_glwe_dimension: GlweDimension, + output_polynomial_size: PolynomialSize, + fp_keyswitch_key: &CudaVec, + base_log: DecompositionBaseLog, + l_gadget: DecompositionLevelCount, + num_lwes: LweCiphertextCount, +) { + cuda_fp_keyswitch_lwe_list_to_glwe_64( + streams.ptr[0], + streams.gpu_indexes[0], + glwe_array_out.as_mut_c_ptr(0), + lwe_array_in.as_c_ptr(0), + fp_keyswitch_key.as_c_ptr(0), + input_lwe_dimension.0 as u32, + output_glwe_dimension.0 as u32, + output_polynomial_size.0 as u32, + base_log.0 as u32, + l_gadget.0 as u32, + num_lwes.0 as u32, + ); +} + /// Convert programmable bootstrap key /// /// # Safety diff --git a/tfhe/src/integer/client_key/radix.rs b/tfhe/src/integer/client_key/radix.rs index 36333ddbc2..fce78b2335 100644 --- a/tfhe/src/integer/client_key/radix.rs +++ b/tfhe/src/integer/client_key/radix.rs @@ -1,11 +1,22 @@ //! 
Definition of the client key for radix decomposition use super::{ClientKey, RecomposableSignedInteger, SecretEncryptionKeyView}; +#[cfg(feature = "gpu")] +use crate::core_crypto::gpu::CudaStreams; use crate::core_crypto::prelude::{SignedNumeric, UnsignedNumeric}; use crate::integer::backward_compatibility::client_key::RadixClientKeyVersions; use crate::integer::block_decomposition::{DecomposableInto, RecomposableFrom}; use crate::integer::ciphertext::{RadixCiphertext, SignedRadixCiphertext}; +#[cfg(feature = "gpu")] +use crate::integer::gpu::list_compression::server_keys::{ + CudaCompressionKey, CudaDecompressionKey, +}; use crate::integer::BooleanBlock; +use crate::shortint::list_compression::{ + CompressedCompressionKey, CompressedDecompressionKey, CompressionKey, CompressionPrivateKeys, + DecompressionKey, +}; +use crate::shortint::parameters::CompressionParameters; use crate::shortint::{Ciphertext as ShortintCiphertext, PBSParameters as ShortintParameters}; use serde::{Deserialize, Serialize}; use tfhe_versionable::Versionize; @@ -131,6 +142,40 @@ impl RadixClientKey { pub fn num_blocks(&self) -> usize { self.num_blocks } + + pub fn new_compression_private_key( + &self, + params: CompressionParameters, + ) -> CompressionPrivateKeys { + self.key.key.new_compression_private_key(params) + } + + pub fn new_compression_decompression_keys( + &self, + private_compression_key: &CompressionPrivateKeys, + ) -> (CompressionKey, DecompressionKey) { + self.key + .key + .new_compression_decompression_keys(private_compression_key) + } + pub fn new_compressed_compression_decompression_keys( + &self, + private_compression_key: &CompressionPrivateKeys, + ) -> (CompressedCompressionKey, CompressedDecompressionKey) { + self.key + .key + .new_compressed_compression_decompression_keys(private_compression_key) + } + #[cfg(feature = "gpu")] + pub fn new_cuda_compression_decompression_keys( + &self, + private_compression_key: &CompressionPrivateKeys, + streams: &CudaStreams, + ) -> (CudaCompressionKey, CudaDecompressionKey) { + self.key + .key + .new_cuda_compression_decompression_keys(private_compression_key, streams) + } } impl From<(ClientKey, usize)> for RadixClientKey { diff --git a/tfhe/src/integer/gpu/ciphertext/compressed_ciphertext_list.rs b/tfhe/src/integer/gpu/ciphertext/compressed_ciphertext_list.rs new file mode 100644 index 0000000000..f734d59279 --- /dev/null +++ b/tfhe/src/integer/gpu/ciphertext/compressed_ciphertext_list.rs @@ -0,0 +1,205 @@ +use crate::core_crypto::gpu::glwe_ciphertext_list::CudaGlweCiphertextList; +use crate::core_crypto::gpu::CudaStreams; +use crate::core_crypto::prelude::CiphertextModulusLog; +use crate::integer::ciphertext::DataKind; +use crate::integer::gpu::ciphertext::info::CudaBlockInfo; +use crate::integer::gpu::ciphertext::{ + CudaRadixCiphertext, CudaSignedRadixCiphertext, CudaUnsignedRadixCiphertext, +}; +use crate::integer::gpu::list_compression::server_keys::{ + CudaCompressionKey, CudaDecompressionKey, +}; + +pub struct CudaCompressedCiphertextList { + pub(crate) packed_list: ( + CudaGlweCiphertextList, + Vec, + CiphertextModulusLog, + ), + info: Vec, +} +impl CudaCompressedCiphertextList { + pub fn len(&self) -> usize { + self.info.len() + } + + pub fn is_empty(&self) -> bool { + self.info.len() == 0 + } + + pub fn get( + &self, + index: usize, + decomp_key: &CudaDecompressionKey, + streams: &CudaStreams, + gpu_index: u32, + ) -> CudaRadixCiphertext +where { + let preceding_infos = self.info.get(..index).unwrap(); + let current_info = 
self.info.get(index).copied().unwrap(); + + let start_block_index: usize = preceding_infos + .iter() + .copied() + .map(DataKind::num_blocks) + .sum(); + + let end_block_index = start_block_index + current_info.num_blocks() - 1; + + decomp_key.unpack( + &self.packed_list, + start_block_index, + end_block_index, + streams, + gpu_index, + ) + } +} + +pub trait CudaCompressible { + fn compress_into( + self, + messages: &mut Vec, + streams: &CudaStreams, + ) -> DataKind; +} + +// Todo: Can we combine these two impl using CudaIntegerRadixCiphertext? +impl CudaCompressible for CudaSignedRadixCiphertext { + fn compress_into( + self, + messages: &mut Vec, + streams: &CudaStreams, + ) -> DataKind { + let x = self.ciphertext.duplicate(streams); + + let copy = x.duplicate(streams); + messages.push(copy); + + let num_blocks = x.d_blocks.lwe_ciphertext_count().0; + DataKind::Signed(num_blocks) + } +} +impl CudaCompressible for CudaUnsignedRadixCiphertext { + fn compress_into( + self, + messages: &mut Vec, + streams: &CudaStreams, + ) -> DataKind { + let x = self.ciphertext.duplicate(streams); + + let copy = x.duplicate(streams); + messages.push(copy); + + let num_blocks = x.d_blocks.lwe_ciphertext_count().0; + + DataKind::Unsigned(num_blocks) + } +} + +pub struct CudaCompressedCiphertextListBuilder { + pub(crate) ciphertexts: Vec, + pub(crate) info: Vec, +} + +impl CudaCompressedCiphertextListBuilder { + #[allow(clippy::new_without_default)] + pub fn new() -> Self { + Self { + ciphertexts: vec![], + info: vec![], + } + } + + pub fn push(&mut self, data: T, streams: &CudaStreams) -> &mut Self { + let kind = data.compress_into(&mut self.ciphertexts, streams); + + if kind.num_blocks() != 0 { + self.info.push(kind); + } + + self + } + + pub fn build( + &self, + comp_key: &CudaCompressionKey, + streams: &CudaStreams, + gpu_index: u32, + ) -> CudaCompressedCiphertextList { + let packed_list = + comp_key.compress_ciphertexts_into_list(&self.ciphertexts, streams, gpu_index); + + CudaCompressedCiphertextList { + packed_list, + info: self.info.clone(), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::integer::gpu::gen_keys_radix_gpu; + use crate::shortint::parameters::list_compression::COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64; + use crate::shortint::prelude::PARAM_MESSAGE_2_CARRY_2_KS_PBS; + + #[test] + fn test_gpu_ciphertext_compression() { + let gpu_index = 0; + let streams = CudaStreams::new_single_gpu(gpu_index); + + let num_blocks = 4; + let (cks, _) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, num_blocks, &streams); + + let private_compression_key = + cks.new_compression_private_key(COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64); + + let (compressed_compression_key, compressed_decompression_key) = + cks.new_compressed_compression_decompression_keys(&private_compression_key); + + let cuda_compression_key = compressed_compression_key.decompress_to_cuda(&streams); + let cuda_decompression_key = + compressed_decompression_key.decompress_to_cuda(cks.parameters(), &streams); + + let ct1 = cks.encrypt(3_u32); + let ct2 = cks.encrypt(2_u32); + let ct3 = cks.encrypt_signed(-2); + + // Copy to GPU + let d_ct1 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct1, &streams); + let d_ct2 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct2, &streams); + let d_ct3 = CudaSignedRadixCiphertext::from_signed_radix_ciphertext(&ct3, &streams); + + let cuda_compressed = CudaCompressedCiphertextListBuilder::new() + .push(d_ct1, &streams) + .push(d_ct2, &streams) + .push(d_ct3, 
&streams) + .build(&cuda_compression_key, &streams, gpu_index); + + let d_decompressed1 = CudaUnsignedRadixCiphertext { + ciphertext: cuda_compressed.get(0, &cuda_decompression_key, &streams, 0), + }; + + let decompressed1 = d_decompressed1.to_radix_ciphertext(&streams); + let decrypted: u32 = cks.decrypt(&decompressed1); + + assert_eq!(decrypted, 3_u32); + let d_decompressed2 = CudaUnsignedRadixCiphertext { + ciphertext: cuda_compressed.get(1, &cuda_decompression_key, &streams, 0), + }; + + let decompressed2 = d_decompressed2.to_radix_ciphertext(&streams); + let decrypted: u32 = cks.decrypt(&decompressed2); + + assert_eq!(decrypted, 2_u32); + let d_decompressed3 = CudaSignedRadixCiphertext { + ciphertext: cuda_compressed.get(2, &cuda_decompression_key, &streams, 0), + }; + + let decompressed3 = d_decompressed3.to_signed_radix_ciphertext(&streams); + let decrypted: i32 = cks.decrypt_signed(&decompressed3); + + assert_eq!(decrypted, -2); + } +} diff --git a/tfhe/src/integer/gpu/ciphertext/mod.rs b/tfhe/src/integer/gpu/ciphertext/mod.rs index 7f0408eddb..51e702d0c8 100644 --- a/tfhe/src/integer/gpu/ciphertext/mod.rs +++ b/tfhe/src/integer/gpu/ciphertext/mod.rs @@ -1,4 +1,5 @@ pub mod boolean_value; +pub mod compressed_ciphertext_list; pub mod info; use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList; diff --git a/tfhe/src/integer/gpu/list_compression/compressed_server_keys.rs b/tfhe/src/integer/gpu/list_compression/compressed_server_keys.rs new file mode 100644 index 0000000000..0a89c6efba --- /dev/null +++ b/tfhe/src/integer/gpu/list_compression/compressed_server_keys.rs @@ -0,0 +1,51 @@ +use crate::core_crypto::gpu::lwe_bootstrap_key::CudaLweBootstrapKey; +use crate::core_crypto::gpu::CudaStreams; +use crate::integer::gpu::list_compression::server_keys::{ + CudaCompressionKey, CudaDecompressionKey, +}; +use crate::integer::gpu::server_key::CudaBootstrappingKey; +use crate::shortint::list_compression::{ + CompressedCompressionKey, CompressedDecompressionKey, CompressionKey, +}; +use crate::shortint::PBSParameters; + +impl CompressedDecompressionKey { + pub fn decompress_to_cuda( + &self, + parameters: PBSParameters, + streams: &CudaStreams, + ) -> CudaDecompressionKey { + let h_bootstrap_key = self + .blind_rotate_key + .as_view() + .par_decompress_into_lwe_bootstrap_key(); + + let d_bootstrap_key = + CudaLweBootstrapKey::from_lwe_bootstrap_key(&h_bootstrap_key, streams); + + let blind_rotate_key = CudaBootstrappingKey::Classic(d_bootstrap_key); + + CudaDecompressionKey { + blind_rotate_key, + lwe_per_glwe: self.lwe_per_glwe, + parameters, + } + } +} + +impl CompressedCompressionKey { + pub fn decompress_to_cuda(&self, streams: &CudaStreams) -> CudaCompressionKey { + let packing_key_switching_key = self + .packing_key_switching_key + .as_view() + .decompress_into_lwe_packing_keyswitch_key(); + + let glwe_compression_key = CompressionKey { + packing_key_switching_key, + lwe_per_glwe: self.lwe_per_glwe, + storage_log_modulus: self.storage_log_modulus, + }; + + CudaCompressionKey::from_compression_key(&glwe_compression_key, streams) + } +} diff --git a/tfhe/src/integer/gpu/list_compression/mod.rs b/tfhe/src/integer/gpu/list_compression/mod.rs new file mode 100644 index 0000000000..8cb3b2e38c --- /dev/null +++ b/tfhe/src/integer/gpu/list_compression/mod.rs @@ -0,0 +1,2 @@ +pub mod compressed_server_keys; +pub mod server_keys; diff --git a/tfhe/src/integer/gpu/list_compression/server_keys.rs b/tfhe/src/integer/gpu/list_compression/server_keys.rs new file mode 100644 index 
0000000000..44193c3ee4 --- /dev/null +++ b/tfhe/src/integer/gpu/list_compression/server_keys.rs @@ -0,0 +1,328 @@ +use crate::core_crypto::gpu::entities::lwe_packing_keyswitch_key::CudaLwePackingKeyswitchKey; +use crate::core_crypto::gpu::glwe_ciphertext_list::CudaGlweCiphertextList; +use crate::core_crypto::gpu::lwe_bootstrap_key::CudaLweBootstrapKey; +use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList; +use crate::core_crypto::gpu::vec::CudaVec; +use crate::core_crypto::gpu::CudaStreams; +use crate::core_crypto::prelude::{ + allocate_and_generate_new_lwe_packing_keyswitch_key, + par_allocate_and_generate_new_lwe_bootstrap_key, CiphertextModulusLog, GlweCiphertextCount, + LweBootstrapKeyOwned, LweCiphertextCount, LwePackingKeyswitchKey, +}; +use crate::integer::gpu::ciphertext::info::{CudaBlockInfo, CudaRadixCiphertextInfo}; +use crate::integer::gpu::ciphertext::CudaRadixCiphertext; +use crate::integer::gpu::server_key::CudaBootstrappingKey; +use crate::integer::gpu::{ + compression_compress_integer_radix_async, compression_decompress_integer_radix_async, + cuda_memcpy_async_gpu_to_gpu, +}; +use crate::integer::parameters::{GlweDimension, PolynomialSize}; +use crate::shortint::client_key::ClientKey; +use crate::shortint::engine::ShortintEngine; +use crate::shortint::list_compression::{CompressionKey, CompressionPrivateKeys}; +use crate::shortint::{ClassicPBSParameters, EncryptionKeyChoice, PBSParameters}; +use itertools::Itertools; + +#[derive(Debug)] +pub struct CudaCompressionKey { + pub packing_key_switching_key: CudaLwePackingKeyswitchKey, + pub h_packing_key_switching_key: LwePackingKeyswitchKey>, + pub lwe_per_glwe: LweCiphertextCount, + pub storage_log_modulus: CiphertextModulusLog, +} + +pub struct CudaDecompressionKey { + pub blind_rotate_key: CudaBootstrappingKey, + pub lwe_per_glwe: LweCiphertextCount, + pub parameters: PBSParameters, +} + +impl CudaCompressionKey { + pub fn from_compression_key(compression_key: &CompressionKey, streams: &CudaStreams) -> Self { + Self { + packing_key_switching_key: CudaLwePackingKeyswitchKey::from_lwe_packing_keyswitch_key( + &compression_key.packing_key_switching_key, + streams, + ), + h_packing_key_switching_key: compression_key.clone().packing_key_switching_key, + lwe_per_glwe: compression_key.lwe_per_glwe, + storage_log_modulus: compression_key.storage_log_modulus, + } + } + + fn flatten( + vec_ciphertexts: &Vec, + streams: &CudaStreams, + gpu_index: u32, + ) -> (CudaLweCiphertextList, Vec) { + let first_ct = &vec_ciphertexts.first().unwrap().d_blocks; + + // We assume all ciphertexts will have the same lwe dimension + let lwe_dimension = first_ct.lwe_dimension(); + let ciphertext_modulus = first_ct.ciphertext_modulus(); + + // Compute total number of lwe ciphertexts we will be handling + let total_num_blocks: usize = vec_ciphertexts + .iter() + .map(|x| x.d_blocks.lwe_ciphertext_count().0) + .sum(); + + let lwe_ciphertext_count = LweCiphertextCount(total_num_blocks); + + let d_vec = unsafe { + let mut d_vec = CudaVec::new_async( + lwe_dimension.to_lwe_size().0 * lwe_ciphertext_count.0, + streams, + gpu_index, + ); + let mut offset: usize = 0; + for ciphertext in vec_ciphertexts { + // Todo: We might use copy_self_range_gpu_to_gpu_async here + let dest_ptr = d_vec + .as_mut_c_ptr(gpu_index) + .add(offset * std::mem::size_of::()); + let size = ciphertext.d_blocks.0.d_vec.len * std::mem::size_of::(); + cuda_memcpy_async_gpu_to_gpu( + dest_ptr, + ciphertext.d_blocks.0.d_vec.as_c_ptr(gpu_index), + size as u64, + 
streams.ptr[gpu_index as usize], + streams.gpu_indexes[gpu_index as usize], + ); + + offset += ciphertext.d_blocks.0.d_vec.len; + } + + streams.synchronize(); + d_vec + }; + + let flattened_ciphertexts = + CudaLweCiphertextList::from_cuda_vec(d_vec, lwe_ciphertext_count, ciphertext_modulus); + + let info = vec_ciphertexts + .iter() + .flat_map(|x| x.info.blocks.clone()) + .collect_vec(); + + (flattened_ciphertexts, info) + } + + pub fn compress_ciphertexts_into_list( + &self, + ciphertexts: &Vec, + streams: &CudaStreams, + gpu_index: u32, + ) -> ( + CudaGlweCiphertextList, + Vec, + CiphertextModulusLog, + ) { + let lwe_pksk = &self.packing_key_switching_key; + + let ciphertext_modulus = lwe_pksk.ciphertext_modulus(); + let compress_polynomial_size = lwe_pksk.output_polynomial_size(); + let compress_glwe_size = lwe_pksk.output_glwe_size(); + + let first_ct = ciphertexts.first().unwrap(); + let first_ct_info = first_ct.info.blocks.first().unwrap(); + let message_modulus = first_ct_info.message_modulus; + let carry_modulus = first_ct_info.carry_modulus; + + let lwe_dimension = first_ct.d_blocks.lwe_dimension(); + + let num_lwes: usize = ciphertexts + .iter() + .map(|x| x.d_blocks.lwe_ciphertext_count().0) + .sum(); + + let mut output_glwe = CudaGlweCiphertextList::new( + compress_glwe_size.to_glwe_dimension(), + compress_polynomial_size, + GlweCiphertextCount(ciphertexts.len()), + ciphertext_modulus, + streams, + ); + + let (input_lwes, info) = Self::flatten(ciphertexts, streams, gpu_index); + + unsafe { + compression_compress_integer_radix_async( + streams, + &mut output_glwe.0.d_vec, + &input_lwes.0.d_vec, + &self.packing_key_switching_key.d_vec, + message_modulus, + carry_modulus, + GlweDimension(0), + PolynomialSize(0), + compress_glwe_size.to_glwe_dimension(), + compress_polynomial_size, + lwe_dimension, + lwe_pksk.decomposition_base_log(), + lwe_pksk.decomposition_level_count(), + self.lwe_per_glwe.0 as u32, + self.storage_log_modulus.0 as u32, + num_lwes as u32, + ); + } + + (output_glwe, info, self.storage_log_modulus) + } +} + +impl CudaDecompressionKey { + pub fn unpack( + &self, + packed_list: &( + CudaGlweCiphertextList, + Vec, + CiphertextModulusLog, + ), + start_block_index: usize, + end_block_index: usize, + streams: &CudaStreams, + gpu_index: u32, + ) -> CudaRadixCiphertext { + let indexes_array = (start_block_index..=end_block_index) + .map(|x| x as u32) + .collect_vec(); + + let encryption_glwe_dimension = self.parameters.glwe_dimension(); + let encryption_polynomial_size = self.parameters.polynomial_size(); + let compression_glwe_dimension = packed_list.0.glwe_dimension(); + let compression_polynomial_size = packed_list.0.polynomial_size(); + let lwe_ciphertext_count = LweCiphertextCount(end_block_index - start_block_index + 1); + let message_modulus = self.parameters.message_modulus(); + let carry_modulus = self.parameters.carry_modulus(); + let ciphertext_modulus = self.parameters.ciphertext_modulus(); + let storage_log_modulus = packed_list.2; + + match &self.blind_rotate_key { + CudaBootstrappingKey::Classic(bsk) => { + let lwe_dimension = bsk.output_lwe_dimension(); + + let mut output_lwe = CudaLweCiphertextList::new( + lwe_dimension, + lwe_ciphertext_count, + ciphertext_modulus, + streams, + ); + + unsafe { + let d_indexes_array = + CudaVec::from_cpu_async(indexes_array.as_slice(), streams, gpu_index); + + compression_decompress_integer_radix_async( + streams, + &mut output_lwe.0.d_vec, + &packed_list.0 .0.d_vec, + &bsk.d_vec, + message_modulus, + carry_modulus, + 
encryption_glwe_dimension, + encryption_polynomial_size, + compression_glwe_dimension, + compression_polynomial_size, + lwe_dimension, + bsk.decomp_base_log(), + bsk.decomp_level_count(), + self.lwe_per_glwe.0 as u32, + storage_log_modulus.0 as u32, + &d_indexes_array, + lwe_ciphertext_count.0 as u32, + ); + } + + CudaRadixCiphertext { + d_blocks: output_lwe, + info: CudaRadixCiphertextInfo { + blocks: packed_list.1.clone(), + }, + } + } + CudaBootstrappingKey::MultiBit(_) => { + panic! {"Unsupported PBS"} + } + } + } +} + +impl ClientKey { + pub fn new_cuda_compression_decompression_keys( + &self, + private_compression_key: &CompressionPrivateKeys, + streams: &CudaStreams, + ) -> (CudaCompressionKey, CudaDecompressionKey) { + let params = &private_compression_key.params; + let cks_params: ClassicPBSParameters = match self.parameters.pbs_parameters().unwrap() { + PBSParameters::PBS(a) => a, + PBSParameters::MultiBitPBS(_) => { + panic!("Compression is currently not compatible with Multi Bit PBS") + } + }; + + assert_eq!( + cks_params.encryption_key_choice, + EncryptionKeyChoice::Big, + "Compression is only compatible with ciphertext in post PBS dimension" + ); + + let packing_key_switching_key = ShortintEngine::with_thread_local_mut(|engine| { + allocate_and_generate_new_lwe_packing_keyswitch_key( + &self.large_lwe_secret_key(), + &private_compression_key.post_packing_ks_key, + params.packing_ks_base_log, + params.packing_ks_level, + params.packing_ks_key_noise_distribution, + self.parameters.ciphertext_modulus(), + &mut engine.encryption_generator, + ) + }); + + assert!( + private_compression_key.params.storage_log_modulus.0 + <= cks_params + .polynomial_size + .to_blind_rotation_input_modulus_log() + .0, + "Compression parameters say to store more bits than useful" + ); + + let glwe_compression_key = CompressionKey { + packing_key_switching_key, + lwe_per_glwe: params.lwe_per_glwe, + storage_log_modulus: private_compression_key.params.storage_log_modulus, + }; + + let mut engine = ShortintEngine::new(); + let h_bootstrap_key: LweBootstrapKeyOwned = + par_allocate_and_generate_new_lwe_bootstrap_key( + &private_compression_key + .post_packing_ks_key + .as_lwe_secret_key(), + &self.glwe_secret_key, + private_compression_key.params.br_base_log, + private_compression_key.params.br_level, + self.parameters.glwe_noise_distribution(), + self.parameters.ciphertext_modulus(), + &mut engine.encryption_generator, + ); + + let d_bootstrap_key = + CudaLweBootstrapKey::from_lwe_bootstrap_key(&h_bootstrap_key, streams); + + let blind_rotate_key = CudaBootstrappingKey::Classic(d_bootstrap_key); + + let cuda_glwe_decompression_key = CudaDecompressionKey { + blind_rotate_key, + lwe_per_glwe: params.lwe_per_glwe, + parameters: self.parameters.pbs_parameters().unwrap(), + }; + + ( + CudaCompressionKey::from_compression_key(&glwe_compression_key, streams), + cuda_glwe_decompression_key, + ) + } +} diff --git a/tfhe/src/integer/gpu/mod.rs b/tfhe/src/integer/gpu/mod.rs index bb2151caa2..62669adfe1 100644 --- a/tfhe/src/integer/gpu/mod.rs +++ b/tfhe/src/integer/gpu/mod.rs @@ -1,6 +1,6 @@ pub mod ciphertext; +pub mod list_compression; pub mod server_key; - use crate::core_crypto::gpu::slice::{CudaSlice, CudaSliceMut}; use crate::core_crypto::gpu::vec::CudaVec; use crate::core_crypto::gpu::CudaStreams; @@ -12,6 +12,7 @@ use crate::integer::{ClientKey, RadixClientKey}; use crate::shortint::{CarryModulus, MessageModulus}; pub use server_key::CudaServerKey; use std::cmp::min; + use tfhe_cuda_backend::cuda_bind::*; 
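Taken together, the client-key entry points added above (`new_compression_private_key`, `new_cuda_compression_decompression_keys`), the builder, and `CudaCompressedCiphertextList::get` combine as in the minimal sketch below. It is written as if it lived in the crate's own test module, mirroring the unit test earlier in this patch; the module paths and the construction of `CudaUnsignedRadixCiphertext` from its `ciphertext` field are assumptions carried over from that test, not a stable public API.

```rust
use crate::core_crypto::gpu::CudaStreams;
use crate::integer::gpu::ciphertext::compressed_ciphertext_list::CudaCompressedCiphertextListBuilder;
use crate::integer::gpu::ciphertext::CudaUnsignedRadixCiphertext;
use crate::integer::gpu::gen_keys_radix_gpu;
use crate::shortint::parameters::list_compression::COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64;
use crate::shortint::prelude::PARAM_MESSAGE_2_CARRY_2_KS_PBS;

// Sketch (not part of the patch): compress one radix ciphertext on the GPU and get it back,
// using the direct (non-compressed) key-generation path added to RadixClientKey.
fn gpu_compression_round_trip() {
    let streams = CudaStreams::new_single_gpu(0);
    let (cks, _sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, 4, &streams);

    // Key generation: private packing key, then CUDA compression/decompression keys.
    let private_compression_key =
        cks.new_compression_private_key(COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64);
    let (compression_key, decompression_key) =
        cks.new_cuda_compression_decompression_keys(&private_compression_key, &streams);

    // Pack a ciphertext into the GLWE-backed list, then unpack entry 0.
    let d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&cks.encrypt(3_u32), &streams);
    let mut builder = CudaCompressedCiphertextListBuilder::new();
    builder.push(d_ct, &streams);
    let compressed = builder.build(&compression_key, &streams, 0);

    let d_out = CudaUnsignedRadixCiphertext {
        ciphertext: compressed.get(0, &decompression_key, &streams, 0),
    };
    let out: u32 = cks.decrypt(&d_out.to_radix_ciphertext(&streams));
    assert_eq!(out, 3);
}
```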
#[repr(u32)] @@ -32,6 +33,13 @@ pub enum PBSType { Classical = 1, } +#[allow(dead_code)] +#[repr(u32)] +pub enum CompressionType { + Compress = 0, + Decompress = 1, +} + #[repr(u32)] pub enum ShiftRotateType { LeftShift = 0, @@ -273,6 +281,164 @@ pub unsafe fn unchecked_scalar_mul_integer_radix_kb_async( + streams: &CudaStreams, + glwe_array_out: &mut CudaVec, + lwe_array_in: &CudaVec, + fp_keyswitch_key: &CudaVec, + message_modulus: MessageModulus, + carry_modulus: CarryModulus, + encryption_glwe_dimension: GlweDimension, + encryption_polynomial_size: PolynomialSize, + compression_glwe_dimension: GlweDimension, + compression_polynomial_size: PolynomialSize, + lwe_dimension: LweDimension, + ks_base_log: DecompositionBaseLog, + ks_level: DecompositionLevelCount, + lwe_per_glwe: u32, + storage_log_modulus: u32, + num_blocks: u32, +) { + assert_eq!( + streams.gpu_indexes[0], + lwe_array_in.gpu_index(0), + "GPU error: all data should reside on the same GPU." + ); + assert_eq!( + streams.gpu_indexes[0], + fp_keyswitch_key.gpu_index(0), + "GPU error: all data should reside on the same GPU." + ); + let mut mem_ptr: *mut i8 = std::ptr::null_mut(); + scratch_cuda_compression_integer_radix_ciphertext_64( + streams.ptr.as_ptr(), + streams.gpu_indexes.as_ptr(), + streams.len() as u32, + std::ptr::addr_of_mut!(mem_ptr), + encryption_glwe_dimension.0 as u32, + encryption_polynomial_size.0 as u32, + compression_glwe_dimension.0 as u32, + compression_polynomial_size.0 as u32, + lwe_dimension.0 as u32, + ks_level.0 as u32, + ks_base_log.0 as u32, + 0u32, + 0u32, + 0u32, + num_blocks, + message_modulus.0 as u32, + carry_modulus.0 as u32, + PBSType::Classical as u32, + lwe_per_glwe, + storage_log_modulus, + CompressionType::Compress as u32, + true, + ); + + cuda_compression_compress_integer_radix_ciphertext_64( + streams.ptr.as_ptr(), + streams.gpu_indexes.as_ptr(), + streams.len() as u32, + glwe_array_out.as_mut_c_ptr(0), + lwe_array_in.as_c_ptr(0), + fp_keyswitch_key.ptr.as_ptr(), + num_blocks, + mem_ptr, + ); + + cleanup_cuda_compression_integer_radix_ciphertext_64( + streams.ptr.as_ptr(), + streams.gpu_indexes.as_ptr(), + streams.len() as u32, + std::ptr::addr_of_mut!(mem_ptr), + ); +} + +#[allow(clippy::too_many_arguments)] +/// # Safety +/// +/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization +/// is required +pub unsafe fn compression_decompress_integer_radix_async( + streams: &CudaStreams, + lwe_array_out: &mut CudaVec, + glwe_in: &CudaVec, + bootstrapping_key: &CudaVec, + message_modulus: MessageModulus, + carry_modulus: CarryModulus, + encryption_glwe_dimension: GlweDimension, + encryption_polynomial_size: PolynomialSize, + compression_glwe_dimension: GlweDimension, + compression_polynomial_size: PolynomialSize, + lwe_dimension: LweDimension, + pbs_base_log: DecompositionBaseLog, + pbs_level: DecompositionLevelCount, + lwe_per_glwe: u32, + storage_log_modulus: u32, + vec_indexes: &CudaVec, + num_blocks: u32, +) { + assert_eq!( + streams.gpu_indexes[0], + glwe_in.gpu_index(0), + "GPU error: all data should reside on the same GPU." + ); + assert_eq!( + streams.gpu_indexes[0], + bootstrapping_key.gpu_index(0), + "GPU error: all data should reside on the same GPU." 
+ ); + let mut mem_ptr: *mut i8 = std::ptr::null_mut(); + scratch_cuda_compression_integer_radix_ciphertext_64( + streams.ptr.as_ptr(), + streams.gpu_indexes.as_ptr(), + streams.len() as u32, + std::ptr::addr_of_mut!(mem_ptr), + encryption_glwe_dimension.0 as u32, + encryption_polynomial_size.0 as u32, + compression_glwe_dimension.0 as u32, + compression_polynomial_size.0 as u32, + lwe_dimension.0 as u32, + 0u32, + 0u32, + pbs_level.0 as u32, + pbs_base_log.0 as u32, + 0, + num_blocks, + message_modulus.0 as u32, + carry_modulus.0 as u32, + PBSType::Classical as u32, + lwe_per_glwe, + storage_log_modulus, + CompressionType::Decompress as u32, + true, + ); + + cuda_compression_decompress_integer_radix_ciphertext_64( + streams.ptr.as_ptr(), + streams.gpu_indexes.as_ptr(), + streams.len() as u32, + lwe_array_out.as_mut_c_ptr(0), + glwe_in.as_c_ptr(0), + vec_indexes.as_c_ptr(0), + vec_indexes.len as u32, + bootstrapping_key.ptr.as_ptr(), + mem_ptr, + ); + + cleanup_cuda_compression_integer_radix_ciphertext_64( + streams.ptr.as_ptr(), + streams.gpu_indexes.as_ptr(), + streams.len() as u32, + std::ptr::addr_of_mut!(mem_ptr), + ); +} + #[allow(clippy::too_many_arguments)] /// # Safety ///
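The decompression entry point above takes a device array of block indexes; on the Rust side those indexes come from the block-range arithmetic in `CudaCompressedCiphertextList::get` and `CudaDecompressionKey::unpack`: the start index is the sum of the block counts of all preceding entries, and the end index is inclusive. A standalone sketch of that arithmetic is shown below; `Entry` is a hypothetical stand-in for the patch's `DataKind`, and only the index computation mirrors the patch.

```rust
// Sketch (not part of the patch): block-range bookkeeping behind
// CudaCompressedCiphertextList::get / CudaDecompressionKey::unpack.
#[derive(Clone, Copy)]
enum Entry {
    Unsigned(usize), // number of radix blocks
    Signed(usize),
}

impl Entry {
    fn num_blocks(self) -> usize {
        match self {
            Entry::Unsigned(n) | Entry::Signed(n) => n,
        }
    }
}

/// Inclusive [start_block_index, end_block_index] range of entry `index` in the packed list;
/// `start..=end` is what `unpack` turns into the index array passed to the decompression kernel.
fn block_range(info: &[Entry], index: usize) -> (usize, usize) {
    let start: usize = info[..index].iter().map(|e| e.num_blocks()).sum();
    let end = start + info[index].num_blocks() - 1;
    (start, end)
}

fn main() {
    // Matches the unit test above: two 4-block unsigned values followed by a 4-block signed one.
    let info = [Entry::Unsigned(4), Entry::Unsigned(4), Entry::Signed(4)];
    assert_eq!(block_range(&info, 0), (0, 3));
    assert_eq!(block_range(&info, 1), (4, 7));
    assert_eq!(block_range(&info, 2), (8, 11));
}
```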