From ceaaa3201eebf285d45c63583c1218ce17dc8e19 Mon Sep 17 00:00:00 2001 From: Adam Reeve Date: Mon, 31 Mar 2025 17:12:58 +1300 Subject: [PATCH 1/4] Add encryption round-trip example --- parquet/src/encryption/mod.rs | 92 ++++++++++++++++++++++++++++++++++- 1 file changed, 90 insertions(+), 2 deletions(-) diff --git a/parquet/src/encryption/mod.rs b/parquet/src/encryption/mod.rs index 062c351ac1cd..f42f026b6ecd 100644 --- a/parquet/src/encryption/mod.rs +++ b/parquet/src/encryption/mod.rs @@ -15,8 +15,96 @@ // specific language governing permissions and limitations // under the License. -//! Encryption implementation specific to Parquet, as described -//! in the [spec](https://github.com/apache/parquet-format/blob/master/Encryption.md). +//! This module implements Parquet Modular Encryption, as described in the +//! [specification](https://github.com/apache/parquet-format/blob/master/Encryption.md). +//! +//! # Example of writing and reading an encrypted Parquet file +//! +//! ``` +//! use arrow::array::{ArrayRef, Float32Array, Int32Array, RecordBatch}; +//! use parquet::arrow::arrow_reader::{ArrowReaderOptions, ParquetRecordBatchReaderBuilder}; +//! use parquet::arrow::ArrowWriter; +//! use parquet::encryption::decrypt::FileDecryptionProperties; +//! use parquet::encryption::encrypt::FileEncryptionProperties; +//! use parquet::errors::Result; +//! use parquet::file::properties::WriterProperties; +//! use std::fs::File; +//! use std::sync::Arc; +//! use tempfile::TempDir; +//! +//! // Define 16 byte AES encryption keys to use. +//! static FOOTER_KEY: &[u8; 16] = b"0123456789012345"; +//! static COLUMN_KEY_1: &[u8; 16] = b"1234567890123450"; +//! static COLUMN_KEY_2: &[u8; 16] = b"1234567890123451"; +//! +//! let temp_dir = TempDir::new()?; +//! let file_path = temp_dir.path().join("encrypted_example.parquet"); +//! +//! // Create file encryption properties, which define how the file is encrypted. +//! // We will specify a key to encrypt the footer metadata, +//! 
// then separate keys for different columns. +//! // This allows fine-grained control of access to different columns within a Parquet file. +//! // Note that any columns without an encryption key specified will be left un-encrypted. +//! // If only a footer key is specified, then all columns are encrypted with the footer key. +//! let encryption_properties = FileEncryptionProperties::builder(FOOTER_KEY.into()) +//! .with_column_key("x", COLUMN_KEY_1.into()) +//! .with_column_key("y", COLUMN_KEY_2.into()) +//! // We also set an AAD prefix, which is optional. +//! // This contributes to the "additional authenticated data" that is used to verify file +//! // integrity and prevents data being swapped with data encrypted with the same key. +//! .with_aad_prefix(b"example_aad".into()) +//! // Specify that the AAD prefix is stored in the file, so readers don't need +//! // to provide it to read the data, but can optionally provide it if they want to +//! // verify file integrity. +//! .with_aad_prefix_storage(true) +//! .build()?; +//! +//! let writer_properties = WriterProperties::builder() +//! .with_file_encryption_properties(encryption_properties) +//! .build(); +//! +//! // Write the encrypted Parquet file +//! { +//! let file = File::create(&file_path)?; +//! +//! let ids = Int32Array::from(vec![0, 1, 2, 3, 4, 5]); +//! let x_vals = Float32Array::from(vec![0.0, 0.1, 0.2, 0.3, 0.4, 0.5]); +//! let y_vals = Float32Array::from(vec![1.0, 1.1, 1.2, 1.3, 1.4, 1.5]); +//! let batch = RecordBatch::try_from_iter(vec![ +//! ("id", Arc::new(ids) as ArrayRef), +//! ("x", Arc::new(x_vals) as ArrayRef), +//! ("y", Arc::new(y_vals) as ArrayRef), +//! ])?; +//! +//! let mut writer = ArrowWriter::try_new(file, batch.schema(), Some(writer_properties))?; +//! +//! writer.write(&batch)?; +//! writer.close()?; +//! } +//! +//! // In order to read the encrypted Parquet file, we need to know the encryption +//! // keys used to encrypt it. +//! 
// We don't need to provide the AAD prefix as it was stored in the file metadata, +//! // but we could specify it here if we wanted to verify the file hasn't been tampered with: +//! let decryption_properties = FileDecryptionProperties::builder(FOOTER_KEY.into()) +//! .with_column_key("x", COLUMN_KEY_1.into()) +//! .with_column_key("y", COLUMN_KEY_2.into()) +//! .build()?; +//! +//! let reader_options = +//! ArrowReaderOptions::new().with_file_decryption_properties(decryption_properties); +//! +//! // Read the file using the configured decryption properties +//! let file = File::open(&file_path)?; +//! +//! let builder = ParquetRecordBatchReaderBuilder::try_new_with_options(file, reader_options)?; +//! let record_reader = builder.build()?; +//! for batch in record_reader { +//! let batch = batch?; +//! println!("Read batch: {:?}", batch); +//! } +//! # Ok::<(), parquet::errors::ParquetError>(()) +//! ``` pub(crate) mod ciphers; pub mod decrypt; From 473a3998772b3fd8eac683117540a9ed961c4479 Mon Sep 17 00:00:00 2001 From: Adam Reeve Date: Wed, 2 Apr 2025 13:43:45 +1300 Subject: [PATCH 2/4] Add examples for building FileEncryptionProperties and FileDecryptionProperties --- parquet/src/encryption/decrypt.rs | 40 ++++++++++++++++++++++++++++++- parquet/src/encryption/encrypt.rs | 37 ++++++++++++++++++++++++++++ 2 files changed, 76 insertions(+), 1 deletion(-) diff --git a/parquet/src/encryption/decrypt.rs b/parquet/src/encryption/decrypt.rs index 41e5757f3670..d150587119f0 100644 --- a/parquet/src/encryption/decrypt.rs +++ b/parquet/src/encryption/decrypt.rs @@ -195,7 +195,43 @@ impl PartialEq for DecryptionKeys { } } -/// FileDecryptionProperties hold keys and AAD data required to decrypt a Parquet file. +/// `FileDecryptionProperties` hold keys and AAD data required to decrypt a Parquet file. 
+/// +/// When reading Arrow data, the `FileDecryptionProperties` should be included in the +/// [`ArrowReaderOptions`](crate::arrow::arrow_reader::ArrowReaderOptions) using +/// [`with_file_decryption_properties`](crate::arrow::arrow_reader::ArrowReaderOptions::with_file_decryption_properties). +/// +/// # Examples +/// +/// Create `FileDecryptionProperties` for a file encrypted with uniform encryption, +/// where all metadata and data are encrypted with the footer key: +/// ``` +/// # use parquet::encryption::decrypt::FileDecryptionProperties; +/// let file_encryption_properties = FileDecryptionProperties::builder(b"0123456789012345".into()) +/// .build()?; +/// # Ok::<(), parquet::errors::ParquetError>(()) +/// ``` +/// +/// Create properties for a file where columns are encrypted with different keys: +/// ``` +/// # use parquet::encryption::decrypt::FileDecryptionProperties; +/// let file_encryption_properties = FileDecryptionProperties::builder(b"0123456789012345".into()) +/// .with_column_key("x", b"1234567890123450".into()) +/// .with_column_key("y", b"1234567890123451".into()) +/// .build()?; +/// # Ok::<(), parquet::errors::ParquetError>(()) +/// ``` +/// +/// Specify additional authenticated data, used to protect against data replacement. +/// This must match the AAD prefix provided when the file was written, otherwise +/// data decryption will fail. +/// ``` +/// # use parquet::encryption::decrypt::FileDecryptionProperties; +/// let file_encryption_properties = FileDecryptionProperties::builder(b"0123456789012345".into()) +/// .with_aad_prefix("example_file".as_bytes().to_vec()) +/// .build()?; +/// # Ok::<(), parquet::errors::ParquetError>(()) +/// ``` #[derive(Clone, PartialEq)] pub struct FileDecryptionProperties { keys: DecryptionKeys, @@ -277,6 +313,8 @@ impl std::fmt::Debug for FileDecryptionProperties { } /// Builder for [`FileDecryptionProperties`] +/// +/// See [`FileDecryptionProperties`] for example usage. 
pub struct DecryptionPropertiesBuilder { footer_key: Option<Vec<u8>>, key_retriever: Option<Arc<dyn KeyRetriever>>, diff --git a/parquet/src/encryption/encrypt.rs b/parquet/src/encryption/encrypt.rs index 13cab64fa6a1..1f5212048ce0 100644 --- a/parquet/src/encryption/encrypt.rs +++ b/parquet/src/encryption/encrypt.rs @@ -53,6 +53,41 @@ impl EncryptionKey { #[derive(Debug, Clone, PartialEq)] /// Defines how data in a Parquet file should be encrypted +/// +/// The `FileEncryptionProperties` should be included in the [`WriterProperties`](crate::file::properties::WriterProperties) +/// used to write a file by using [`WriterPropertiesBuilder::with_file_encryption_properties`](crate::file::properties::WriterPropertiesBuilder::with_file_encryption_properties). +/// +/// # Examples +/// +/// Create `FileEncryptionProperties` for a file encrypted with uniform encryption, +/// where all metadata and data are encrypted with the footer key: +/// ``` +/// # use parquet::encryption::encrypt::FileEncryptionProperties; +/// let file_encryption_properties = FileEncryptionProperties::builder(b"0123456789012345".into()) +/// .build()?; +/// # Ok::<(), parquet::errors::ParquetError>(()) +/// ``` +/// +/// Create properties for a file where columns are encrypted with different keys. +/// Any columns without a key specified will be unencrypted: +/// ``` +/// # use parquet::encryption::encrypt::FileEncryptionProperties; +/// let file_encryption_properties = FileEncryptionProperties::builder(b"0123456789012345".into()) +/// .with_column_key("x", b"1234567890123450".into()) +/// .with_column_key("y", b"1234567890123451".into()) +/// .build()?; +/// # Ok::<(), parquet::errors::ParquetError>(()) +/// ``` +/// +/// Specify additional authenticated data, used to protect against data replacement. 
+/// This should represent the file identity: +/// ``` +/// # use parquet::encryption::encrypt::FileEncryptionProperties; +/// let file_encryption_properties = FileEncryptionProperties::builder(b"0123456789012345".into()) +/// .with_aad_prefix("example_file".as_bytes().to_vec()) +/// .build()?; +/// # Ok::<(), parquet::errors::ParquetError>(()) +/// ``` pub struct FileEncryptionProperties { encrypt_footer: bool, footer_key: EncryptionKey, @@ -141,6 +176,8 @@ impl FileEncryptionProperties { } /// Builder for [`FileEncryptionProperties`] +/// +/// See [`FileEncryptionProperties`] for example usage. pub struct EncryptionPropertiesBuilder { encrypt_footer: bool, footer_key: EncryptionKey, From 2e14d6a71e53baa683a3bda80cb605191f5bf73c Mon Sep 17 00:00:00 2001 From: Adam Reeve Date: Wed, 2 Apr 2025 14:19:50 +1300 Subject: [PATCH 3/4] Add an example of using a KeyRetriever --- parquet/src/encryption/decrypt.rs | 72 +++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/parquet/src/encryption/decrypt.rs b/parquet/src/encryption/decrypt.rs index d150587119f0..529c098ca5a1 100644 --- a/parquet/src/encryption/decrypt.rs +++ b/parquet/src/encryption/decrypt.rs @@ -28,6 +28,78 @@ use std::io::Read; use std::sync::Arc; /// Trait for retrieving an encryption key using the key's metadata +/// +/// # Example +/// +/// This shows how you might use a `KeyRetriever` to decrypt a Parquet file +/// if you have a set of known encryption keys with identifiers, but at read time +/// you may not know which columns were encrypted and which keys were used. +/// +/// In practice, the key metadata might instead store an encrypted key that must +/// be decrypted with a Key Management Server. 
+/// +/// ``` +/// # use std::collections::HashMap; +/// # use std::sync::{Arc, Mutex}; +/// # use parquet::encryption::decrypt::{FileDecryptionProperties, KeyRetriever}; +/// # use parquet::encryption::encrypt::FileEncryptionProperties; +/// # use parquet::errors::ParquetError; +/// // Define known encryption keys +/// let mut keys = HashMap::new(); +/// keys.insert("kf".to_owned(), b"0123456789012345".to_vec()); +/// keys.insert("kc1".to_owned(), b"1234567890123450".to_vec()); +/// keys.insert("kc2".to_owned(), b"1234567890123451".to_vec()); +/// +/// // Create encryption properties for writing a file, +/// // and specify the key identifiers as the key metadata. +/// let encryption_properties = FileEncryptionProperties::builder(keys.get("kf").unwrap().clone()) +/// .with_footer_key_metadata("kf".as_bytes().into()) +/// .with_column_key_and_metadata("x", keys.get("kc1").unwrap().clone(), "kc1".as_bytes().into()) +/// .with_column_key_and_metadata("y", keys.get("kc2").unwrap().clone(), "kc2".as_bytes().into()) +/// .build()?; +/// +/// // Write an encrypted file with the properties +/// // ... 
+/// +/// // Define a KeyRetriever that can get encryption keys using their identifiers +/// struct CustomKeyRetriever { +/// keys: Mutex<HashMap<String, Vec<u8>>>, +/// } +/// +/// impl KeyRetriever for CustomKeyRetriever { +/// fn retrieve_key(&self, key_metadata: &[u8]) -> parquet::errors::Result<Vec<u8>> { +/// // Metadata is bytes, so convert it to a string identifier +/// let key_metadata = std::str::from_utf8(key_metadata).map_err(|e| { +/// ParquetError::General(format!("Could not convert key metadata to string: {}", e)) +/// })?; +/// // Lookup the key +/// let keys = self.keys.lock().unwrap(); +/// match keys.get(key_metadata) { +/// Some(key) => Ok(key.clone()), +/// None => Err(ParquetError::General(format!( +/// "Could not retrieve key for metadata {:?}", +/// key_metadata +/// ))), +/// } +/// } +/// } +/// +/// let key_retriever = Arc::new(CustomKeyRetriever { +/// keys: Mutex::new(keys), +/// }); +/// +/// // Create decryption properties for reading an encrypted file. +/// // Note that we don't need to specify which columns are encrypted, +/// // this is determined by the file metadata and the required keys will be retrieved +/// // dynamically using our key retriever. +/// let decryption_properties = FileDecryptionProperties::with_key_retriever(key_retriever) +/// .build()?; +/// +/// // Read an encrypted file with the decryption properties +/// // ... 
+/// +/// # Ok::<(), parquet::errors::ParquetError>(()) +/// ``` pub trait KeyRetriever: Send + Sync { /// Retrieve a decryption key given the key metadata fn retrieve_key(&self, key_metadata: &[u8]) -> Result<Vec<u8>>; From 965c0e00805af622ca885cb4397d345d124a1c61 Mon Sep 17 00:00:00 2001 From: Adam Reeve Date: Fri, 4 Apr 2025 22:20:03 +1300 Subject: [PATCH 4/4] Apply suggestions from code review Co-authored-by: Matthijs Brobbel --- parquet/src/encryption/decrypt.rs | 9 ++++----- parquet/src/encryption/encrypt.rs | 2 +- parquet/src/encryption/mod.rs | 2 +- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/parquet/src/encryption/decrypt.rs b/parquet/src/encryption/decrypt.rs index 529c098ca5a1..09274213445f 100644 --- a/parquet/src/encryption/decrypt.rs +++ b/parquet/src/encryption/decrypt.rs @@ -53,7 +53,7 @@ use std::sync::Arc; /// // Create encryption properties for writing a file, /// // and specify the key identifiers as the key metadata. /// let encryption_properties = FileEncryptionProperties::builder(keys.get("kf").unwrap().clone()) -/// .with_footer_key_metadata("kf".as_bytes().into()) +/// .with_footer_key_metadata("kf".into()) /// .with_column_key_and_metadata("x", keys.get("kc1").unwrap().clone(), "kc1".as_bytes().into()) /// .with_column_key_and_metadata("y", keys.get("kc2").unwrap().clone(), "kc2".as_bytes().into()) /// .build()?; @@ -70,15 +70,14 @@ use std::sync::Arc; /// fn retrieve_key(&self, key_metadata: &[u8]) -> parquet::errors::Result<Vec<u8>> { /// // Metadata is bytes, so convert it to a string identifier /// let key_metadata = std::str::from_utf8(key_metadata).map_err(|e| { -/// ParquetError::General(format!("Could not convert key metadata to string: {}", e)) +/// ParquetError::General(format!("Could not convert key metadata to string: {e}")) /// })?; /// // Lookup the key /// let keys = self.keys.lock().unwrap(); /// match keys.get(key_metadata) { /// Some(key) => Ok(key.clone()), /// None => Err(ParquetError::General(format!( -/// "Could not 
retrieve key for metadata {:?}", -/// key_metadata +/// "Could not retrieve key for metadata {key_metadata:?}" /// ))), /// } /// } @@ -300,7 +299,7 @@ impl PartialEq for DecryptionKeys { /// ``` /// # use parquet::encryption::decrypt::FileDecryptionProperties; /// let file_encryption_properties = FileDecryptionProperties::builder(b"0123456789012345".into()) -/// .with_aad_prefix("example_file".as_bytes().to_vec()) +/// .with_aad_prefix("example_file".into()) /// .build()?; /// # Ok::<(), parquet::errors::ParquetError>(()) /// ``` diff --git a/parquet/src/encryption/encrypt.rs b/parquet/src/encryption/encrypt.rs index 1f5212048ce0..9a801434c0db 100644 --- a/parquet/src/encryption/encrypt.rs +++ b/parquet/src/encryption/encrypt.rs @@ -84,7 +84,7 @@ impl EncryptionKey { /// ``` /// # use parquet::encryption::encrypt::FileEncryptionProperties; /// let file_encryption_properties = FileEncryptionProperties::builder(b"0123456789012345".into()) -/// .with_aad_prefix("example_file".as_bytes().to_vec()) +/// .with_aad_prefix("example_file".into()) /// .build()?; /// # Ok::<(), parquet::errors::ParquetError>(()) /// ``` diff --git a/parquet/src/encryption/mod.rs b/parquet/src/encryption/mod.rs index f42f026b6ecd..c1f4ca0da362 100644 --- a/parquet/src/encryption/mod.rs +++ b/parquet/src/encryption/mod.rs @@ -101,7 +101,7 @@ //! let record_reader = builder.build()?; //! for batch in record_reader { //! let batch = batch?; -//! println!("Read batch: {:?}", batch); +//! println!("Read batch: {batch:?}"); //! } //! # Ok::<(), parquet::errors::ParquetError>(()) //! ```