diff --git a/parquet/src/encryption/decrypt.rs b/parquet/src/encryption/decrypt.rs index 41e5757f3670..09274213445f 100644 --- a/parquet/src/encryption/decrypt.rs +++ b/parquet/src/encryption/decrypt.rs @@ -28,6 +28,77 @@ use std::io::Read; use std::sync::Arc; /// Trait for retrieving an encryption key using the key's metadata +/// +/// # Example +/// +/// This shows how you might use a `KeyRetriever` to decrypt a Parquet file +/// if you have a set of known encryption keys with identifiers, but at read time +/// you may not know which columns were encrypted and which keys were used. +/// +/// In practice, the key metadata might instead store an encrypted key that must +/// be decrypted with a Key Management Server. +/// +/// ``` +/// # use std::collections::HashMap; +/// # use std::sync::{Arc, Mutex}; +/// # use parquet::encryption::decrypt::{FileDecryptionProperties, KeyRetriever}; +/// # use parquet::encryption::encrypt::FileEncryptionProperties; +/// # use parquet::errors::ParquetError; +/// // Define known encryption keys +/// let mut keys = HashMap::new(); +/// keys.insert("kf".to_owned(), b"0123456789012345".to_vec()); +/// keys.insert("kc1".to_owned(), b"1234567890123450".to_vec()); +/// keys.insert("kc2".to_owned(), b"1234567890123451".to_vec()); +/// +/// // Create encryption properties for writing a file, +/// // and specify the key identifiers as the key metadata. +/// let encryption_properties = FileEncryptionProperties::builder(keys.get("kf").unwrap().clone()) +/// .with_footer_key_metadata("kf".into()) +/// .with_column_key_and_metadata("x", keys.get("kc1").unwrap().clone(), "kc1".as_bytes().into()) +/// .with_column_key_and_metadata("y", keys.get("kc2").unwrap().clone(), "kc2".as_bytes().into()) +/// .build()?; +/// +/// // Write an encrypted file with the properties +/// // ... 
+/// +/// // Define a KeyRetriever that can get encryption keys using their identifiers +/// struct CustomKeyRetriever { +/// keys: Mutex<HashMap<String, Vec<u8>>>, +/// } +/// +/// impl KeyRetriever for CustomKeyRetriever { +/// fn retrieve_key(&self, key_metadata: &[u8]) -> parquet::errors::Result<Vec<u8>> { +/// // Metadata is bytes, so convert it to a string identifier +/// let key_metadata = std::str::from_utf8(key_metadata).map_err(|e| { +/// ParquetError::General(format!("Could not convert key metadata to string: {e}")) +/// })?; +/// // Look up the key +/// let keys = self.keys.lock().unwrap(); +/// match keys.get(key_metadata) { +/// Some(key) => Ok(key.clone()), +/// None => Err(ParquetError::General(format!( +/// "Could not retrieve key for metadata {key_metadata:?}" +/// ))), +/// } +/// } +/// } +/// +/// let key_retriever = Arc::new(CustomKeyRetriever { +/// keys: Mutex::new(keys), +/// }); +/// +/// // Create decryption properties for reading an encrypted file. +/// // Note that we don't need to specify which columns are encrypted, +/// // this is determined by the file metadata and the required keys will be retrieved +/// // dynamically using our key retriever. +/// let decryption_properties = FileDecryptionProperties::with_key_retriever(key_retriever) +/// .build()?; +/// +/// // Read an encrypted file with the decryption properties +/// // ... +/// +/// # Ok::<(), parquet::errors::ParquetError>(()) +/// ``` pub trait KeyRetriever: Send + Sync { /// Retrieve a decryption key given the key metadata fn retrieve_key(&self, key_metadata: &[u8]) -> Result<Vec<u8>>; @@ -195,7 +266,43 @@ impl PartialEq for DecryptionKeys { } } -/// FileDecryptionProperties hold keys and AAD data required to decrypt a Parquet file. +/// `FileDecryptionProperties` hold keys and AAD data required to decrypt a Parquet file. 
+/// +/// When reading Arrow data, the `FileDecryptionProperties` should be included in the +/// [`ArrowReaderOptions`](crate::arrow::arrow_reader::ArrowReaderOptions) using +/// [`with_file_decryption_properties`](crate::arrow::arrow_reader::ArrowReaderOptions::with_file_decryption_properties). +/// +/// # Examples +/// +/// Create `FileDecryptionProperties` for a file encrypted with uniform encryption, +/// where all metadata and data are encrypted with the footer key: +/// ``` +/// # use parquet::encryption::decrypt::FileDecryptionProperties; +/// let file_decryption_properties = FileDecryptionProperties::builder(b"0123456789012345".into()) +/// .build()?; +/// # Ok::<(), parquet::errors::ParquetError>(()) +/// ``` +/// +/// Create properties for a file where columns are encrypted with different keys: +/// ``` +/// # use parquet::encryption::decrypt::FileDecryptionProperties; +/// let file_decryption_properties = FileDecryptionProperties::builder(b"0123456789012345".into()) +/// .with_column_key("x", b"1234567890123450".into()) +/// .with_column_key("y", b"1234567890123451".into()) +/// .build()?; +/// # Ok::<(), parquet::errors::ParquetError>(()) +/// ``` +/// +/// Specify additional authenticated data, used to protect against data replacement. +/// This must match the AAD prefix provided when the file was written, otherwise +/// data decryption will fail. +/// ``` +/// # use parquet::encryption::decrypt::FileDecryptionProperties; +/// let file_decryption_properties = FileDecryptionProperties::builder(b"0123456789012345".into()) +/// .with_aad_prefix("example_file".into()) +/// .build()?; +/// # Ok::<(), parquet::errors::ParquetError>(()) +/// ``` #[derive(Clone, PartialEq)] pub struct FileDecryptionProperties { keys: DecryptionKeys, @@ -277,6 +384,8 @@ impl std::fmt::Debug for FileDecryptionProperties { } /// Builder for [`FileDecryptionProperties`] +/// +/// See [`FileDecryptionProperties`] for example usage. 
pub struct DecryptionPropertiesBuilder { footer_key: Option<Vec<u8>>, key_retriever: Option<Arc<dyn KeyRetriever>>, diff --git a/parquet/src/encryption/encrypt.rs b/parquet/src/encryption/encrypt.rs index 13cab64fa6a1..9a801434c0db 100644 --- a/parquet/src/encryption/encrypt.rs +++ b/parquet/src/encryption/encrypt.rs @@ -53,6 +53,41 @@ impl EncryptionKey { #[derive(Debug, Clone, PartialEq)] /// Defines how data in a Parquet file should be encrypted +/// +/// The `FileEncryptionProperties` should be included in the [`WriterProperties`](crate::file::properties::WriterProperties) +/// used to write a file by using [`WriterPropertiesBuilder::with_file_encryption_properties`](crate::file::properties::WriterPropertiesBuilder::with_file_encryption_properties). +/// +/// # Examples +/// +/// Create `FileEncryptionProperties` for a file encrypted with uniform encryption, +/// where all metadata and data are encrypted with the footer key: +/// ``` +/// # use parquet::encryption::encrypt::FileEncryptionProperties; +/// let file_encryption_properties = FileEncryptionProperties::builder(b"0123456789012345".into()) +/// .build()?; +/// # Ok::<(), parquet::errors::ParquetError>(()) +/// ``` +/// +/// Create properties for a file where columns are encrypted with different keys. +/// Any columns without a key specified will be unencrypted: +/// ``` +/// # use parquet::encryption::encrypt::FileEncryptionProperties; +/// let file_encryption_properties = FileEncryptionProperties::builder(b"0123456789012345".into()) +/// .with_column_key("x", b"1234567890123450".into()) +/// .with_column_key("y", b"1234567890123451".into()) +/// .build()?; +/// # Ok::<(), parquet::errors::ParquetError>(()) +/// ``` +/// +/// Specify additional authenticated data, used to protect against data replacement. 
+/// This should represent the file identity: +/// ``` +/// # use parquet::encryption::encrypt::FileEncryptionProperties; +/// let file_encryption_properties = FileEncryptionProperties::builder(b"0123456789012345".into()) +/// .with_aad_prefix("example_file".into()) +/// .build()?; +/// # Ok::<(), parquet::errors::ParquetError>(()) +/// ``` pub struct FileEncryptionProperties { encrypt_footer: bool, footer_key: EncryptionKey, @@ -141,6 +176,8 @@ impl FileEncryptionProperties { } /// Builder for [`FileEncryptionProperties`] +/// +/// See [`FileEncryptionProperties`] for example usage. pub struct EncryptionPropertiesBuilder { encrypt_footer: bool, footer_key: EncryptionKey, diff --git a/parquet/src/encryption/mod.rs b/parquet/src/encryption/mod.rs index 062c351ac1cd..c1f4ca0da362 100644 --- a/parquet/src/encryption/mod.rs +++ b/parquet/src/encryption/mod.rs @@ -15,8 +15,96 @@ // specific language governing permissions and limitations // under the License. -//! Encryption implementation specific to Parquet, as described -//! in the [spec](https://github.com/apache/parquet-format/blob/master/Encryption.md). +//! This module implements Parquet Modular Encryption, as described in the +//! [specification](https://github.com/apache/parquet-format/blob/master/Encryption.md). +//! +//! # Example of writing and reading an encrypted Parquet file +//! +//! ``` +//! use arrow::array::{ArrayRef, Float32Array, Int32Array, RecordBatch}; +//! use parquet::arrow::arrow_reader::{ArrowReaderOptions, ParquetRecordBatchReaderBuilder}; +//! use parquet::arrow::ArrowWriter; +//! use parquet::encryption::decrypt::FileDecryptionProperties; +//! use parquet::encryption::encrypt::FileEncryptionProperties; +//! use parquet::errors::Result; +//! use parquet::file::properties::WriterProperties; +//! use std::fs::File; +//! use std::sync::Arc; +//! use tempfile::TempDir; +//! +//! // Define 16 byte AES encryption keys to use. +//! static FOOTER_KEY: &[u8; 16] = b"0123456789012345"; +//! 
static COLUMN_KEY_1: &[u8; 16] = b"1234567890123450"; +//! static COLUMN_KEY_2: &[u8; 16] = b"1234567890123451"; +//! +//! let temp_dir = TempDir::new()?; +//! let file_path = temp_dir.path().join("encrypted_example.parquet"); +//! +//! // Create file encryption properties, which define how the file is encrypted. +//! // We will specify a key to encrypt the footer metadata, +//! // then separate keys for different columns. +//! // This allows fine-grained control of access to different columns within a Parquet file. +//! // Note that any columns without an encryption key specified will be left un-encrypted. +//! // If only a footer key is specified, then all columns are encrypted with the footer key. +//! let encryption_properties = FileEncryptionProperties::builder(FOOTER_KEY.into()) +//! .with_column_key("x", COLUMN_KEY_1.into()) +//! .with_column_key("y", COLUMN_KEY_2.into()) +//! // We also set an AAD prefix, which is optional. +//! // This contributes to the "additional authenticated data" that is used to verify file +//! // integrity and prevents data being swapped with data encrypted with the same key. +//! .with_aad_prefix(b"example_aad".into()) +//! // Specify that the AAD prefix is stored in the file, so readers don't need +//! // to provide it to read the data, but can optionally provide it if they want to +//! // verify file integrity. +//! .with_aad_prefix_storage(true) +//! .build()?; +//! +//! let writer_properties = WriterProperties::builder() +//! .with_file_encryption_properties(encryption_properties) +//! .build(); +//! +//! // Write the encrypted Parquet file +//! { +//! let file = File::create(&file_path)?; +//! +//! let ids = Int32Array::from(vec![0, 1, 2, 3, 4, 5]); +//! let x_vals = Float32Array::from(vec![0.0, 0.1, 0.2, 0.3, 0.4, 0.5]); +//! let y_vals = Float32Array::from(vec![1.0, 1.1, 1.2, 1.3, 1.4, 1.5]); +//! let batch = RecordBatch::try_from_iter(vec![ +//! ("id", Arc::new(ids) as ArrayRef), +//! 
("x", Arc::new(x_vals) as ArrayRef), +//! ("y", Arc::new(y_vals) as ArrayRef), +//! ])?; +//! +//! let mut writer = ArrowWriter::try_new(file, batch.schema(), Some(writer_properties))?; +//! +//! writer.write(&batch)?; +//! writer.close()?; +//! } +//! +//! // In order to read the encrypted Parquet file, we need to know the encryption +//! // keys used to encrypt it. +//! // We don't need to provide the AAD prefix as it was stored in the file metadata, +//! // but we could specify it here if we wanted to verify the file hasn't been tampered with: +//! let decryption_properties = FileDecryptionProperties::builder(FOOTER_KEY.into()) +//! .with_column_key("x", COLUMN_KEY_1.into()) +//! .with_column_key("y", COLUMN_KEY_2.into()) +//! .build()?; +//! +//! let reader_options = +//! ArrowReaderOptions::new().with_file_decryption_properties(decryption_properties); +//! +//! // Read the file using the configured decryption properties +//! let file = File::open(&file_path)?; +//! +//! let builder = ParquetRecordBatchReaderBuilder::try_new_with_options(file, reader_options)?; +//! let record_reader = builder.build()?; +//! for batch in record_reader { +//! let batch = batch?; +//! println!("Read batch: {batch:?}"); +//! } +//! # Ok::<(), parquet::errors::ParquetError>(()) +//! ``` pub(crate) mod ciphers; pub mod decrypt;