diff --git a/parquet/README.md b/parquet/README.md index 9245664b4ef0..8fc72bfbc32a 100644 --- a/parquet/README.md +++ b/parquet/README.md @@ -84,7 +84,7 @@ The `parquet` crate provides the following features which may be enabled in your - [ ] Row record writer - [x] Arrow record writer - [x] Async support - - [ ] Encrypted files + - [x] Encrypted files - [x] Predicate pushdown - [x] Parquet format 4.0.0 support diff --git a/parquet/examples/read_with_rowgroup.rs b/parquet/examples/read_with_rowgroup.rs index 44d25596110e..8cccc7fe14ac 100644 --- a/parquet/examples/read_with_rowgroup.rs +++ b/parquet/examples/read_with_rowgroup.rs @@ -35,12 +35,7 @@ async fn main() -> Result<()> { let mut file = File::open(&path).await.unwrap(); // The metadata could be cached in other places, this example only shows how to read - let metadata = file - .get_metadata( - #[cfg(feature = "encryption")] - None, - ) - .await?; + let metadata = file.get_metadata().await?; for rg in metadata.row_groups() { let mut rowgroup = InMemoryRowGroup::create(rg.clone(), ProjectionMask::all()); @@ -126,8 +121,6 @@ impl RowGroups for InMemoryRowGroup { self.metadata.column(i), self.num_rows(), None, - #[cfg(feature = "encryption")] - None, )?); Ok(Box::new(ColumnChunkIterator { diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index 0e087b1b21bf..527e79e61f82 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -1311,9 +1311,7 @@ mod tests { use crate::basic::Encoding; use crate::data_type::AsBytes; #[cfg(feature = "encryption")] - use crate::encryption::{ - decryption::FileDecryptionProperties, encrypt::FileEncryptionProperties, - }; + use crate::encryption::{decrypt::FileDecryptionProperties, encrypt::FileEncryptionProperties}; use crate::file::metadata::ParquetMetaData; use crate::file::page_index::index::Index; use crate::file::page_index::index_reader::read_offset_indexes; @@ -3812,8 +3810,8 @@ mod tests { let column_2_key = "1234567890123451".as_bytes(); let decryption_properties = FileDecryptionProperties::builder(footer_key.to_vec()) - .with_column_key("double_field".as_bytes().to_vec(), column_1_key.to_vec()) - .with_column_key("float_field".as_bytes().to_vec(), column_2_key.to_vec()) + .with_column_key("double_field", column_1_key.to_vec()) + .with_column_key("float_field", column_2_key.to_vec()) .build() .unwrap(); diff --git a/parquet/src/arrow/async_reader/metadata.rs b/parquet/src/arrow/async_reader/metadata.rs index 4f41cd6ceebd..71d2e57ddd50 100644 --- a/parquet/src/arrow/async_reader/metadata.rs +++ b/parquet/src/arrow/async_reader/metadata.rs @@ -128,26 +128,13 @@ impl MetadataLoader { let (metadata, remainder) = if length > suffix_len - FOOTER_SIZE { let metadata_start = file_size - length - FOOTER_SIZE; let meta = fetch.fetch(metadata_start..file_size - FOOTER_SIZE).await?; - ( - ParquetMetaDataReader::decode_metadata( - &meta, - footer.is_encrypted_footer(), - #[cfg(feature = "encryption")] - None, - )?, - None, - ) + (ParquetMetaDataReader::decode_metadata(&meta)?, None) } else { let metadata_start = file_size - length - FOOTER_SIZE - footer_start; let slice = &suffix[metadata_start..suffix_len - FOOTER_SIZE]; ( - ParquetMetaDataReader::decode_metadata( - slice, - footer.is_encrypted_footer(), - #[cfg(feature = "encryption")] - None, - )?, + ParquetMetaDataReader::decode_metadata(slice)?, Some((footer_start, suffix.slice(..metadata_start))), ) }; diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index 32139e6cd827..91259ab662b5 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -1542,9 +1542,7 @@ mod tests { reader::{get_column_reader, get_typed_column_reader, ColumnReaderImpl}, }; #[cfg(feature = "encryption")] - use crate::encryption::{ - decryption::FileDecryptionProperties, encrypt::FileEncryptionProperties, - }; + use crate::encryption::{decrypt::FileDecryptionProperties, encrypt::FileEncryptionProperties}; use crate::file::writer::TrackedWrite; use crate::file::{ properties::ReaderProperties, reader::SerializedPageReader, writer::SerializedPageWriter, @@ -2126,8 +2124,6 @@ mod tests { r.rows_written as usize, None, Arc::new(props), - #[cfg(feature = "encryption")] - None, ) .unwrap(); @@ -2180,8 +2176,6 @@ mod tests { r.rows_written as usize, None, Arc::new(props), - #[cfg(feature = "encryption")] - None, ) .unwrap(); @@ -2317,8 +2311,6 @@ mod tests { r.rows_written as usize, None, Arc::new(props), - #[cfg(feature = "encryption")] - None, ) .unwrap(), ); @@ -3543,7 +3535,7 @@ mod tests { let _file_metadata = writer.close().unwrap(); let decryption_properties = FileDecryptionProperties::builder(footer_key.to_vec()) - .with_column_key(b"a".to_vec(), column_key.key().clone()) + .with_column_key("a", column_key.key().clone()) .build() .unwrap(); let options = ArrowReaderOptions::default() @@ -3955,8 +3947,6 @@ mod tests { result.rows_written as usize, None, Arc::new(props), - #[cfg(feature = "encryption")] - None, ) .unwrap(), ); diff --git a/parquet/src/encryption/ciphers.rs b/parquet/src/encryption/ciphers.rs index 88ec2a7431f7..a77b829745ca 100644 --- a/parquet/src/encryption/ciphers.rs +++ b/parquet/src/encryption/ciphers.rs @@ -20,6 +20,7 @@ use crate::errors::Result; use ring::aead::{Aad, LessSafeKey, NonceSequence, UnboundKey, AES_128_GCM}; use ring::rand::{SecureRandom, SystemRandom}; use std::fmt::Debug; +use crate::errors::ParquetError; const RIGHT_TWELVE: u128 = 0x0000_0000_ffff_ffff_ffff_ffff_ffff_ffff; const NONCE_LEN: usize = 12; @@ -167,7 +168,7 @@ mod tests { fn test_round_trip() { let key = [0u8; 16]; let mut encryptor = RingGcmBlockEncryptor::new(&key).unwrap(); - let decryptor = RingGcmBlockDecryptor::new(&key); + let decryptor = RingGcmBlockDecryptor::new(&key).unwrap(); let plaintext = b"hello, world!"; let aad = b"some aad"; diff --git a/parquet/src/encryption/decryption.rs b/parquet/src/encryption/decryption.rs deleted file mode 100644 index 2c789d020923..000000000000 --- a/parquet/src/encryption/decryption.rs +++ /dev/null @@ -1,256 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use crate::encryption::ciphers::{BlockDecryptor, RingGcmBlockDecryptor}; -use crate::encryption::modules::{create_module_aad, ModuleType}; -use crate::errors::Result; -use std::collections::HashMap; -use std::io::Read; -use std::sync::Arc; - -pub fn read_and_decrypt( - decryptor: &Arc, - input: &mut T, - aad: &[u8], -) -> Result> { - let mut len_bytes = [0; 4]; - input.read_exact(&mut len_bytes)?; - let ciphertext_len = u32::from_le_bytes(len_bytes) as usize; - let mut ciphertext = vec![0; 4 + ciphertext_len]; - ciphertext[0..4].copy_from_slice(&len_bytes); - input.read_exact(&mut ciphertext[4..])?; - - decryptor.decrypt(&ciphertext, aad.as_ref()) -} - -#[derive(Debug, Clone)] -pub struct CryptoContext { - pub(crate) row_group_ordinal: usize, - pub(crate) column_ordinal: usize, - pub(crate) page_ordinal: Option, - pub(crate) dictionary_page: bool, - // We have separate data and metadata decryptors because - // in GCM CTR mode, the metadata and data pages use - // different algorithms. - data_decryptor: Arc, - metadata_decryptor: Arc, - file_aad: Vec, -} - -impl CryptoContext { - pub fn new( - row_group_ordinal: usize, - column_ordinal: usize, - data_decryptor: Arc, - metadata_decryptor: Arc, - file_aad: Vec, - ) -> Self { - Self { - row_group_ordinal, - column_ordinal, - page_ordinal: None, - dictionary_page: false, - data_decryptor, - metadata_decryptor, - file_aad, - } - } - - pub fn with_page_ordinal(&self, page_ordinal: usize) -> Self { - Self { - row_group_ordinal: self.row_group_ordinal, - column_ordinal: self.column_ordinal, - page_ordinal: Some(page_ordinal), - dictionary_page: false, - data_decryptor: self.data_decryptor.clone(), - metadata_decryptor: self.metadata_decryptor.clone(), - file_aad: self.file_aad.clone(), - } - } - - pub(crate) fn create_page_header_aad(&self) -> Result> { - let module_type = if self.dictionary_page { - ModuleType::DictionaryPageHeader - } else { - ModuleType::DataPageHeader - }; - - create_module_aad( - self.file_aad(), - module_type, - self.row_group_ordinal, - self.column_ordinal, - self.page_ordinal, - ) - } - - pub(crate) fn create_page_aad(&self) -> Result> { - let module_type = if self.dictionary_page { - ModuleType::DictionaryPage - } else { - ModuleType::DataPage - }; - - create_module_aad( - self.file_aad(), - module_type, - self.row_group_ordinal, - self.column_ordinal, - self.page_ordinal, - ) - } - - pub fn for_dictionary_page(&self) -> Self { - Self { - row_group_ordinal: self.row_group_ordinal, - column_ordinal: self.column_ordinal, - page_ordinal: self.page_ordinal, - dictionary_page: true, - data_decryptor: self.data_decryptor.clone(), - metadata_decryptor: self.metadata_decryptor.clone(), - file_aad: self.file_aad.clone(), - } - } - - pub fn data_decryptor(&self) -> &Arc { - &self.data_decryptor - } - - pub fn metadata_decryptor(&self) -> &Arc { - &self.metadata_decryptor - } - - pub fn file_aad(&self) -> &Vec { - &self.file_aad - } -} - -/// FileDecryptionProperties hold keys and AAD data required to decrypt a Parquet file. -#[derive(Debug, Clone, PartialEq)] -pub struct FileDecryptionProperties { - footer_key: Vec, - column_keys: Option, Vec>>, - aad_prefix: Option>, -} - -impl FileDecryptionProperties { - /// Returns a new FileDecryptionProperties builder - pub fn builder(footer_key: Vec) -> DecryptionPropertiesBuilder { - DecryptionPropertiesBuilder::new(footer_key) - } -} - -pub struct DecryptionPropertiesBuilder { - footer_key: Vec, - column_keys: Option, Vec>>, - aad_prefix: Option>, -} - -impl DecryptionPropertiesBuilder { - pub fn new(footer_key: Vec) -> DecryptionPropertiesBuilder { - Self { - footer_key, - column_keys: None, - aad_prefix: None, - } - } - - pub fn build(self) -> Result { - Ok(FileDecryptionProperties { - footer_key: self.footer_key, - column_keys: self.column_keys, - aad_prefix: self.aad_prefix, - }) - } - - pub fn with_aad_prefix(mut self, value: Vec) -> Self { - self.aad_prefix = Some(value); - self - } - - pub fn with_column_key(mut self, column_name: Vec, decryption_key: Vec) -> Self { - let mut column_keys = self.column_keys.unwrap_or_default(); - column_keys.insert(column_name, decryption_key); - self.column_keys = Some(column_keys); - self - } -} - -#[derive(Clone, Debug)] -pub struct FileDecryptor { - decryption_properties: FileDecryptionProperties, - footer_decryptor: Option>, - file_aad: Vec, -} - -impl PartialEq for FileDecryptor { - fn eq(&self, other: &Self) -> bool { - self.decryption_properties == other.decryption_properties - } -} - -impl FileDecryptor { - pub(crate) fn new( - decryption_properties: &FileDecryptionProperties, - aad_file_unique: Vec, - aad_prefix: Vec, - ) -> Self { - let file_aad = [aad_prefix.as_slice(), aad_file_unique.as_slice()].concat(); - let footer_decryptor = RingGcmBlockDecryptor::new(&decryption_properties.footer_key); - - Self { - // todo decr: if no key available yet (not set in properties, will be retrieved from metadata) - footer_decryptor: Some(Arc::new(footer_decryptor)), - decryption_properties: decryption_properties.clone(), - file_aad, - } - } - - pub(crate) fn get_footer_decryptor(&self) -> Arc { - self.footer_decryptor.clone().unwrap() - } - - pub(crate) fn get_column_data_decryptor(&self, column_name: &[u8]) -> Arc { - match self.decryption_properties.column_keys.as_ref() { - None => self.get_footer_decryptor(), - Some(column_keys) => match column_keys.get(column_name) { - None => self.get_footer_decryptor(), - Some(column_key) => Arc::new(RingGcmBlockDecryptor::new(column_key)), - }, - } - } - - pub(crate) fn get_column_metadata_decryptor( - &self, - column_name: &[u8], - ) -> Arc { - // Once GCM CTR mode is implemented, data and metadata decryptors may be different - self.get_column_data_decryptor(column_name) - } - - pub(crate) fn file_aad(&self) -> &Vec { - &self.file_aad - } - - pub(crate) fn is_column_encrypted(&self, column_name: &[u8]) -> bool { - // Column is encrypted if either uniform encryption is used or an encryption key is set for the column - match self.decryption_properties.column_keys.as_ref() { - None => true, - Some(keys) => keys.contains_key(column_name), - } - } -} diff --git a/parquet/src/file/footer.rs b/parquet/src/file/footer.rs index 5be084259e18..85ef30cd0ecc 100644 --- a/parquet/src/file/footer.rs +++ b/parquet/src/file/footer.rs @@ -58,12 +58,7 @@ pub fn parse_metadata(chunk_reader: &R) -> Result Result { - ParquetMetaDataReader::decode_metadata( - buf, - false, - #[cfg(feature = "encryption")] - None, - ) + ParquetMetaDataReader::decode_metadata(buf) } /// Decodes the Parquet footer returning the metadata length in bytes diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index 6a5eb286db36..6dc559c8b816 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -103,14 +103,17 @@ use crate::encryption::{ }; use crate::errors::{ParquetError, Result}; #[cfg(feature = "encryption")] -use crate::file::column_crypto_metadata::{self, ColumnCryptoMetaData}; +use crate::file::column_crypto_metadata::ColumnCryptoMetaData; +#[cfg(feature = "encryption")] +use crate::format::ColumnCryptoMetaData as TColumnCryptoMetaData; + +#[cfg(feature = "encryption")] +use crate::file::column_crypto_metadata; pub(crate) use crate::file::metadata::memory::HeapSize; use crate::file::page_encoding_stats::{self, PageEncodingStats}; use crate::file::page_index::index::Index; use crate::file::page_index::offset_index::OffsetIndexMetaData; use crate::file::statistics::{self, Statistics}; -#[cfg(feature = "encryption")] -use crate::format::ColumnCryptoMetaData as TColumnCryptoMetaData; use crate::format::{ BoundaryOrder, ColumnChunk, ColumnIndex, ColumnMetaData, OffsetIndex, PageLocation, RowGroup, SizeStatistics, SortingColumn, @@ -659,11 +662,11 @@ impl RowGroupMetaData { d.path().string() )); } - Some(ColumnCryptoMetaData::ENCRYPTIONWITHCOLUMNKEY(crypto_metadata)) => { + Some(TColumnCryptoMetaData::ENCRYPTIONWITHCOLUMNKEY(crypto_metadata)) => { let column_name = crypto_metadata.path_in_schema.join("."); decryptor.get_column_metadata_decryptor(column_name.as_str())? } - Some(ColumnCryptoMetaData::ENCRYPTIONWITHFOOTERKEY(_)) => { + Some(TColumnCryptoMetaData::ENCRYPTIONWITHFOOTERKEY(_)) => { decryptor.get_footer_decryptor()? } }; diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index 36c75d3f1102..94089824b706 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -1467,8 +1467,6 @@ mod tests { total_num_values as usize, None, Arc::new(props), - #[cfg(feature = "encryption")] - None, ) .unwrap(); diff --git a/parquet/src/util/test_common/encryption_util.rs b/parquet/src/util/test_common/encryption_util.rs index 2e655330ff33..8412da99202c 100644 --- a/parquet/src/util/test_common/encryption_util.rs +++ b/parquet/src/util/test_common/encryption_util.rs @@ -18,14 +18,18 @@ use crate::arrow::arrow_reader::{ ArrowReaderMetadata, ArrowReaderOptions, ParquetRecordBatchReaderBuilder, }; +use crate::arrow::ParquetRecordBatchStreamBuilder; use arrow_array::cast::AsArray; use arrow_array::{types, RecordBatch}; use std::fs::File; use crate::arrow::ArrowWriter; +use crate::errors::ParquetError; use crate::encryption::encrypt::FileEncryptionProperties; use crate::encryption::decrypt::FileDecryptionProperties; use crate::file::properties::WriterProperties; +use crate::file::metadata::FileMetaData; +use futures::TryStreamExt; /// Tests reading an encrypted file from the parquet-testing repository pub(crate) fn verify_encryption_test_file_read( diff --git a/parquet/tests/arrow_writer_layout.rs b/parquet/tests/arrow_writer_layout.rs index 9297b8d13f07..9a66d13f84d7 100644 --- a/parquet/tests/arrow_writer_layout.rs +++ b/parquet/tests/arrow_writer_layout.rs @@ -141,8 +141,6 @@ fn assert_layout(file_reader: &Bytes, meta: &ParquetMetaData, layout: &Layout) { row_group.num_rows() as usize, None, Arc::new(properties), - #[cfg(feature = "encryption")] - None, ) .unwrap();