From 9f240bdb1f09d2ea9b6c6479b85721db8e37b196 Mon Sep 17 00:00:00 2001 From: Gidon Gershinsky Date: Thu, 21 Mar 2024 15:13:27 +0200 Subject: [PATCH 01/97] first commit --- parquet/Cargo.toml | 2 + parquet/src/arrow/arrow_reader/mod.rs | 65 +++++- parquet/src/encryption/ciphers.rs | 282 ++++++++++++++++++++++++++ parquet/src/encryption/mod.rs | 21 ++ parquet/src/file/footer.rs | 264 +++++++++++++++++++++++- parquet/src/file/mod.rs | 1 + parquet/src/lib.rs | 4 + 7 files changed, 636 insertions(+), 3 deletions(-) create mode 100644 parquet/src/encryption/ciphers.rs create mode 100644 parquet/src/encryption/mod.rs diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index 00d4c5b750f8..cc7b5688742a 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -70,6 +70,7 @@ half = { version = "2.1", default-features = false, features = ["num-traits"] } sysinfo = { version = "0.33.0", optional = true, default-features = false, features = ["system"] } crc32fast = { version = "1.4.2", optional = true, default-features = false } simdutf8 = { version = "0.1.5", optional = true, default-features = false } +ring = { version = "0.17", default-features = false, features = ["std"]} [dev-dependencies] base64 = { version = "0.22", default-features = false, features = ["std"] } @@ -125,6 +126,7 @@ sysinfo = ["dep:sysinfo"] crc = ["dep:crc32fast"] # Enable SIMD UTF-8 validation simdutf8 = ["dep:simdutf8"] +#encryption = ["aes-gcm", "base64"] [[example]] diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 6eba04c86f91..55f3ab027b37 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -42,6 +42,10 @@ mod filter; mod selection; pub mod statistics; +use crate::file::footer; +use crate::file::page_index::index_reader; +use crate::encryption::ciphers::FileDecryptionProperties; + /// Builder for constructing parquet readers into arrow. 
/// /// Most users should use one of the following specializations: @@ -317,7 +321,7 @@ impl ArrowReaderOptions { /// /// // Create the reader and read the data using the supplied schema. /// let mut reader = builder.build().unwrap(); - /// let _batch = reader.next().unwrap().unwrap(); + /// let _batch = reader.next().unwrap().unwrap(); /// ``` pub fn with_schema(self, schema: SchemaRef) -> Self { Self { @@ -369,6 +373,35 @@ pub struct ArrowReaderMetadata { } impl ArrowReaderMetadata { + /// Loads [`ArrowReaderMetadata`] from the provided [`ChunkReader`] + /// + /// See [`ParquetRecordBatchReaderBuilder::new_with_metadata`] for how this can be used + pub fn load2(reader: &T, options: ArrowReaderOptions) -> Result { + Self::load_with_decryption(reader, options, FileDecryptionProperties::builder().build()) + } + + pub fn load_with_decryption(reader: &T, options: ArrowReaderOptions, + file_decryption_properties: FileDecryptionProperties) -> Result { + let mut metadata = footer::parse_metadata_with_decryption(reader, file_decryption_properties)?; + if options.page_index { + let column_index = metadata + .row_groups() + .iter() + .map(|rg| index_reader::read_columns_indexes(reader, rg.columns())) + .collect::>>()?; + metadata.set_column_index(Some(column_index)); + + let offset_index = metadata + .row_groups() + .iter() + .map(|rg| index_reader::read_offset_indexes(reader, rg.columns())) + .collect::>>()?; + + metadata.set_offset_index(Some(offset_index)) + } + Self::try_new(Arc::new(metadata), options) + } + /// Loads [`ArrowReaderMetadata`] from the provided [`ChunkReader`], if necessary /// /// See [`ParquetRecordBatchReaderBuilder::new_with_metadata`] for an @@ -532,6 +565,11 @@ impl ParquetRecordBatchReaderBuilder { Ok(Self::new_with_metadata(reader, metadata)) } + pub fn try_new_with_decryption(reader: T, options: ArrowReaderOptions, file_decryption_properties: FileDecryptionProperties) -> Result { + let metadata = 
ArrowReaderMetadata::load_with_decryption(&reader, options, file_decryption_properties)?; + Ok(Self::new_with_metadata(reader, metadata)) + } + /// Create a [`ParquetRecordBatchReaderBuilder`] from the provided [`ArrowReaderMetadata`] /// /// This interface allows: @@ -788,6 +826,13 @@ impl ParquetRecordBatchReader { .build() } + pub fn try_new_with_decryption(reader: T, batch_size: usize, + file_decryption_properties: FileDecryptionProperties) -> Result { + ParquetRecordBatchReaderBuilder::try_new_with_decryption(reader, Default::default(), file_decryption_properties)? + .with_batch_size(batch_size) + .build() + } + /// Create a new [`ParquetRecordBatchReader`] from the provided [`RowGroups`] /// /// Note: this is a low-level interface see [`ParquetRecordBatchReader::try_new`] for a @@ -955,6 +1000,7 @@ mod tests { BoolType, ByteArray, ByteArrayType, DataType, FixedLenByteArray, FixedLenByteArrayType, FloatType, Int32Type, Int64Type, Int96Type, }; + use crate::encryption::ciphers; use crate::errors::Result; use crate::file::properties::{EnabledStatistics, WriterProperties, WriterVersion}; use crate::file::writer::SerializedFileWriter; @@ -1788,6 +1834,23 @@ mod tests { assert!(col.value(2).is_nan()); } + #[test] + fn test_uniform_encryption() { + let path = format!( + "{}/uniform_encryption.parquet.encrypted", + arrow::util::test_util::parquet_test_data(), + ); + let file = File::open(path).unwrap(); + // todo + let key_code: &[u8] = "0123456789012345".as_bytes(); + // todo + let decryption_properties = ciphers::FileDecryptionProperties::builder() + .with_footer_key(key_code.to_vec()) + .build(); + let record_reader = ParquetRecordBatchReader::try_new_with_decryption(file, 128, decryption_properties).unwrap(); + // todo check contents + } + #[test] fn test_read_float32_float64_byte_stream_split() { let path = format!( diff --git a/parquet/src/encryption/ciphers.rs b/parquet/src/encryption/ciphers.rs new file mode 100644 index 000000000000..db32146c6d5f --- 
/dev/null +++ b/parquet/src/encryption/ciphers.rs @@ -0,0 +1,282 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Encryption implementation specific to Parquet, as described +//! in the [spec](https://github.com/apache/parquet-format/blob/master/Encryption.md). + +use ring::aead::{Aad, LessSafeKey, NonceSequence, UnboundKey, AES_128_GCM}; +use ring::rand::{SecureRandom, SystemRandom}; +use crate::errors::{ParquetError, Result}; + +pub trait BlockEncryptor { + fn encrypt(&mut self, plaintext: &[u8], aad: &[u8]) -> Vec; +} + +pub trait BlockDecryptor { + fn decrypt(&self, length_and_ciphertext: &[u8], aad: &[u8]) -> Vec; +} + +const RIGHT_TWELVE: u128 = 0x0000_0000_ffff_ffff_ffff_ffff_ffff_ffff; +const NONCE_LEN: usize = 12; +const TAG_LEN: usize = 16; +const SIZE_LEN: usize = 4; + +struct CounterNonce { + start: u128, + counter: u128, +} + +impl CounterNonce { + pub fn new(rng: &SystemRandom) -> Self { + let mut buf = [0; 16]; + rng.fill(&mut buf).unwrap(); + + // Since this is a random seed value, endianess doesn't matter at all, + // and we can use whatever is platform-native. 
+ let start = u128::from_ne_bytes(buf) & RIGHT_TWELVE; + let counter = start.wrapping_add(1); + + Self { start, counter } + } + + /// One accessor for the nonce bytes to avoid potentially flipping endianess + #[inline] + pub fn get_bytes(&self) -> [u8; NONCE_LEN] { + self.counter.to_le_bytes()[0..NONCE_LEN].try_into().unwrap() + } +} + +impl NonceSequence for CounterNonce { + fn advance(&mut self) -> Result { + // If we've wrapped around, we've exhausted this nonce sequence + if (self.counter & RIGHT_TWELVE) == (self.start & RIGHT_TWELVE) { + Err(ring::error::Unspecified) + } else { + // Otherwise, just advance and return the new value + let buf: [u8; NONCE_LEN] = self.get_bytes(); + self.counter = self.counter.wrapping_add(1); + Ok(ring::aead::Nonce::assume_unique_for_key(buf)) + } + } +} + +pub(crate) struct RingGcmBlockEncryptor { + key: LessSafeKey, + nonce_sequence: CounterNonce, +} + +impl RingGcmBlockEncryptor { + // todo TBD: some KMS systems produce data keys, need to be able to pass them to Encryptor. + // todo TBD: for other KMSs, we will create data keys inside arrow-rs, making sure to use SystemRandom + /// Create a new `RingGcmBlockEncryptor` with a given key and random nonce. + /// The nonce will advance appropriately with each block encryption and + /// return an error if it wraps around. 
+ pub(crate) fn new(key_bytes: &[u8]) -> Self { + let rng = SystemRandom::new(); + + // todo support other key sizes + let key = UnboundKey::new(&AES_128_GCM, key_bytes.as_ref()).unwrap(); + let nonce = CounterNonce::new(&rng); + + Self { + key: LessSafeKey::new(key), + nonce_sequence: nonce, + } + } +} + +impl BlockEncryptor for RingGcmBlockEncryptor { + fn encrypt(&mut self, plaintext: &[u8], aad: &[u8]) -> Vec { + let nonce = self.nonce_sequence.advance().unwrap(); + let ciphertext_len = plaintext.len() + NONCE_LEN + TAG_LEN; + // todo TBD: add first 4 bytes with the length, per https://github.com/apache/parquet-format/blob/master/Encryption.md#51-encrypted-module-serialization + let mut result = Vec::with_capacity(SIZE_LEN + ciphertext_len); + result.extend_from_slice((ciphertext_len as i32).to_le_bytes().as_ref()); + result.extend_from_slice(nonce.as_ref()); + result.extend_from_slice(plaintext); + + let tag = self + .key + .seal_in_place_separate_tag(nonce, Aad::from(aad), &mut result[SIZE_LEN + NONCE_LEN..]) + .unwrap(); + result.extend_from_slice(tag.as_ref()); + + result + } +} + +pub(crate) struct RingGcmBlockDecryptor { + key: LessSafeKey, +} + +impl RingGcmBlockDecryptor { + pub(crate) fn new(key_bytes: &[u8]) -> Self { + // todo support other key sizes + let key = UnboundKey::new(&AES_128_GCM, key_bytes).unwrap(); + + Self { + key: LessSafeKey::new(key), + } + } +} + +impl BlockDecryptor for RingGcmBlockDecryptor { + fn decrypt(&self, length_and_ciphertext: &[u8], aad: &[u8]) -> Vec { + let mut result = Vec::with_capacity( + length_and_ciphertext.len() - SIZE_LEN - NONCE_LEN - TAG_LEN, + ); + result.extend_from_slice(&length_and_ciphertext[SIZE_LEN + NONCE_LEN..]); + + let nonce = ring::aead::Nonce::try_assume_unique_for_key( + &length_and_ciphertext[SIZE_LEN..SIZE_LEN + NONCE_LEN], + ) + .unwrap(); + + self.key + .open_in_place(nonce, Aad::from(aad), &mut result) + .unwrap(); + + result + } +} + +pub(crate) enum ModuleType { + Footer = 0, + 
ColumnMetaData = 1, + DataPage = 2, + DictionaryPage = 3, + DataPageHeader = 4, + DictionaryPageHeader = 5, + ColumnIndex = 6, + OffsetIndex = 7, + BloomFilterHeader = 8, + BloomFilterBitset = 9, +} + +pub fn create_footer_aad(file_aad: &[u8]) -> Result> { + create_module_aad(file_aad, ModuleType::Footer, -1, -1, -1) +} + +pub fn create_module_aad(file_aad: &[u8], module_type: ModuleType, row_group_ordinal: i32, + column_ordinal: i32, page_ordinal: i32) -> Result> { + + let module_buf = [module_type as u8]; + + if module_buf[0] == (ModuleType::Footer as u8) { + let mut aad = Vec::with_capacity(file_aad.len() + 1); + aad.extend_from_slice(file_aad); + aad.extend_from_slice(module_buf.as_ref()); + return Ok(aad) + } + + if row_group_ordinal < 0 { + return Err(general_err!("Wrong row group ordinal: {}", row_group_ordinal)); + } + if row_group_ordinal > u16::MAX as i32 { + return Err(general_err!("Encrypted parquet files can't have more than {} row groups: {}", + u16::MAX, row_group_ordinal)); + } + + if column_ordinal < 0 { + return Err(general_err!("Wrong column ordinal: {}", column_ordinal)); + } + if column_ordinal > u16::MAX as i32 { + return Err(general_err!("Encrypted parquet files can't have more than {} columns: {}", + u16::MAX, column_ordinal)); + } + + if module_buf[0] != (ModuleType::DataPageHeader as u8) && + module_buf[0] != (ModuleType::DataPage as u8) { + let mut aad = Vec::with_capacity(file_aad.len() + 5); + aad.extend_from_slice(file_aad); + aad.extend_from_slice(module_buf.as_ref()); + aad.extend_from_slice((row_group_ordinal as u16).to_le_bytes().as_ref()); + aad.extend_from_slice((column_ordinal as u16).to_le_bytes().as_ref()); + return Ok(aad) + } + + if page_ordinal < 0 { + return Err(general_err!("Wrong column ordinal: {}", page_ordinal)); + } + if page_ordinal > u16::MAX as i32 { + return Err(general_err!("Encrypted parquet files can't have more than {} pages in a chunk: {}", + u16::MAX, page_ordinal)); + } + + let mut aad = 
Vec::with_capacity(file_aad.len() + 7); + aad.extend_from_slice(file_aad); + aad.extend_from_slice(module_buf.as_ref()); + aad.extend_from_slice((row_group_ordinal as u16).to_le_bytes().as_ref()); + aad.extend_from_slice((column_ordinal as u16).to_le_bytes().as_ref()); + aad.extend_from_slice((page_ordinal as u16).to_le_bytes().as_ref()); + Ok(aad) +} + +pub struct FileDecryptionProperties { + footer_key: Option> +} + +impl FileDecryptionProperties { + pub fn builder() -> DecryptionPropertiesBuilder { + DecryptionPropertiesBuilder::with_defaults() + } +} + +pub struct DecryptionPropertiesBuilder { + footer_key: Option> +} + +impl DecryptionPropertiesBuilder { + pub fn with_defaults() -> Self { + Self { + footer_key: None + } + } + + pub fn build(self) -> FileDecryptionProperties { + FileDecryptionProperties { + footer_key: self.footer_key + } + } + + // todo decr: doc comment + pub fn with_footer_key(mut self, value: Vec) -> Self { + self.footer_key = Some(value); + self + } +} + +pub struct FileDecryptor { + decryption_properties: FileDecryptionProperties, + // todo decr: change to BlockDecryptor + footer_decryptor: RingGcmBlockDecryptor +} + +impl FileDecryptor { + pub(crate) fn new(decryption_properties: FileDecryptionProperties) -> Self { + Self { + // todo decr: if no key available yet (not set in properties, will be retrieved from metadata) + footer_decryptor: RingGcmBlockDecryptor::new(decryption_properties.footer_key.clone().unwrap().as_ref()), + decryption_properties + } + } + + // todo decr: change to BlockDecryptor + pub(crate) fn get_footer_decryptor(self) -> RingGcmBlockDecryptor { + self.footer_decryptor + } +} diff --git a/parquet/src/encryption/mod.rs b/parquet/src/encryption/mod.rs new file mode 100644 index 000000000000..e0e7f5d81919 --- /dev/null +++ b/parquet/src/encryption/mod.rs @@ -0,0 +1,21 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Encryption implementation specific to Parquet, as described +//! in the [spec](https://github.com/apache/parquet-format/blob/master/Encryption.md). + +pub mod ciphers; diff --git a/parquet/src/file/footer.rs b/parquet/src/file/footer.rs index bd31c9142f56..c7829420b514 100644 --- a/parquet/src/file/footer.rs +++ b/parquet/src/file/footer.rs @@ -17,8 +17,20 @@ //! Module for working with Parquet file footers. -use crate::errors::Result; -use crate::file::{metadata::*, reader::ChunkReader, FOOTER_SIZE}; +use std::{io::Read, sync::Arc}; + +use crate::format::{ColumnOrder as TColumnOrder, FileMetaData as TFileMetaData, + FileCryptoMetaData as TFileCryptoMetaData, EncryptionAlgorithm}; +use crate::thrift::{TCompactSliceInputProtocol, TSerializable}; + +use crate::basic::ColumnOrder; +use crate::encryption::ciphers; +use crate::encryption::ciphers::{BlockDecryptor, FileDecryptionProperties, FileDecryptor}; +use crate::errors::{ParquetError, Result}; +use crate::file::{metadata::*, reader::ChunkReader, + FOOTER_SIZE, PARQUET_MAGIC, PARQUET_MAGIC_ENCR_FOOTER}; + +use crate::schema::types::{self, SchemaDescriptor}; /// Reads the [ParquetMetaData] from the footer of the parquet file. 
/// @@ -49,6 +61,60 @@ pub fn parse_metadata(chunk_reader: &R) -> Result(chunk_reader: &R) -> Result { + parse_metadata_with_decryption(chunk_reader, FileDecryptionProperties::builder().build()) +} + +pub fn parse_metadata_with_decryption(chunk_reader: &R, decr_props: FileDecryptionProperties) -> Result { + // check file is large enough to hold footer + let file_size = chunk_reader.len(); + if file_size < (FOOTER_SIZE as u64) { + return Err(general_err!( + "Invalid Parquet file. Size is smaller than footer" + )); + } + + let mut footer = [0_u8; 8]; + chunk_reader + .get_read(file_size - 8)? + .read_exact(&mut footer)?; + + let encrypted_footer; + // check this is indeed a parquet file + if footer[4..] == PARQUET_MAGIC { + encrypted_footer = false; + } else if footer[4..] == PARQUET_MAGIC_ENCR_FOOTER { + encrypted_footer = true; + //panic!() // todo rm + } else { + return Err(general_err!("Invalid Parquet file. Corrupt footer")); + } + + // get the metadata length from the footer + let metadata_len = u32::from_le_bytes(footer[..4].try_into().unwrap()) as usize; + + //let metadata_len = decode_footer(&footer)?; todo rm this function + let footer_metadata_len = FOOTER_SIZE + metadata_len; + + if footer_metadata_len > file_size as usize { + return Err(general_err!( + "Invalid Parquet file. Reported metadata length of {} + {} byte footer, but file is only {} bytes", + metadata_len, + FOOTER_SIZE, + file_size + )); + } + + let start = file_size - footer_metadata_len as u64; + + if encrypted_footer { + let file_decryptor = FileDecryptor::new(decr_props); + decode_encrypted_metadata(chunk_reader.get_bytes(start, metadata_len)?.as_ref(), file_decryptor) + } else { + decode_metadata(chunk_reader.get_bytes(start, metadata_len)?.as_ref()) + } +} + /// Decodes [`ParquetMetaData`] from the provided bytes. 
/// /// Typically this is used to decode the metadata from the end of a parquet @@ -61,6 +127,41 @@ pub fn decode_metadata(buf: &[u8]) -> Result { ParquetMetaDataReader::decode_metadata(buf) } +pub fn decode_metadata2(buf: &[u8]) -> Result { + decode_metadata_with_decryption(buf) +} + +/// Decodes [`ParquetMetaData`] from the provided bytes +// todo add file decryptor +pub fn decode_metadata_with_decryption(buf: &[u8]) -> Result { + // TODO: row group filtering + let mut prot = TCompactSliceInputProtocol::new(buf); + let t_file_metadata: TFileMetaData = TFileMetaData::read_from_in_protocol(&mut prot) + .map_err(|e| ParquetError::General(format!("Could not parse metadata: {e}")))?; + let schema = types::from_thrift(&t_file_metadata.schema)?; + let schema_descr = Arc::new(SchemaDescriptor::new(schema)); + let mut row_groups = Vec::new(); + for rg in t_file_metadata.row_groups { + row_groups.push(RowGroupMetaData::from_thrift(schema_descr.clone(), rg)?); + } + let column_orders = parse_column_orders(t_file_metadata.column_orders, &schema_descr); + + if t_file_metadata.encryption_algorithm.is_some() { + // todo get key_metadata etc. 
Set file decryptor in return value + // todo check signature + } + + let file_metadata = FileMetaData::new( + t_file_metadata.version, + t_file_metadata.num_rows, + t_file_metadata.created_by, + t_file_metadata.key_value_metadata, + schema_descr, + column_orders, + ); + Ok(ParquetMetaData::new(file_metadata, row_groups)) +} + /// Decodes the Parquet footer returning the metadata length in bytes /// /// A parquet footer is 8 bytes long and has the following layout: @@ -76,3 +177,162 @@ pub fn decode_metadata(buf: &[u8]) -> Result { pub fn decode_footer(slice: &[u8; FOOTER_SIZE]) -> Result { ParquetMetaDataReader::decode_footer(slice) } + +fn decode_encrypted_metadata(buf: &[u8], file_decryptor: FileDecryptor) -> Result { + // parse FileCryptoMetaData + let mut prot = TCompactSliceInputProtocol::new(buf.as_ref()); + let t_file_crypto_metadata: TFileCryptoMetaData = TFileCryptoMetaData::read_from_in_protocol(&mut prot) + .map_err(|e| ParquetError::General(format!("Could not parse crypto metadata: {e}")))?; + let algo = t_file_crypto_metadata.encryption_algorithm; + let aes_gcm_algo = if let EncryptionAlgorithm::AESGCMV1(a) = algo { a } + else { unreachable!() }; // todo decr: add support for GCMCTRV1 + + // todo decr: get key_metadata + + // remaining buffer contains encrypted FileMetaData + let decryptor = file_decryptor.get_footer_decryptor(); + // todo decr: get aad_prefix + // todo decr: set both aad_prefix and aad_file_unique in file_decryptor + let fmd_aad = ciphers::create_footer_aad(aes_gcm_algo.aad_file_unique.unwrap().as_ref()); + let decrypted_fmd_buf = decryptor.decrypt(prot.as_slice().as_ref(), fmd_aad.unwrap().as_ref()); + + // todo add file decryptor + decode_metadata_with_decryption(decrypted_fmd_buf.as_slice()) +} + +// todo decr: add encryption support +/// Decodes the footer returning the metadata length in bytes +pub fn decode_footer2(slice: &[u8; FOOTER_SIZE]) -> Result { + // check this is indeed a parquet file + if slice[4..] 
!= PARQUET_MAGIC { + return Err(general_err!("Invalid Parquet file. Corrupt footer")); + } + + // get the metadata length from the footer + let metadata_len = u32::from_le_bytes(slice[..4].try_into().unwrap()); + // u32 won't be larger than usize in most cases + Ok(metadata_len as usize) +} + +/// Parses column orders from Thrift definition. +/// If no column orders are defined, returns `None`. +fn parse_column_orders( + t_column_orders: Option>, + schema_descr: &SchemaDescriptor, +) -> Option> { + match t_column_orders { + Some(orders) => { + // Should always be the case + assert_eq!( + orders.len(), + schema_descr.num_columns(), + "Column order length mismatch" + ); + let mut res = Vec::new(); + for (i, column) in schema_descr.columns().iter().enumerate() { + match orders[i] { + TColumnOrder::TYPEORDER(_) => { + let sort_order = ColumnOrder::get_sort_order( + column.logical_type(), + column.converted_type(), + column.physical_type(), + ); + res.push(ColumnOrder::TYPE_DEFINED_ORDER(sort_order)); + } + } + } + Some(res) + } + None => None, + } +} + +#[cfg(test)] +mod tests { + use super::*; + use bytes::Bytes; + + use crate::basic::SortOrder; + use crate::basic::Type; + use crate::format::TypeDefinedOrder; + use crate::schema::types::Type as SchemaType; + + #[test] + fn test_parse_metadata_size_smaller_than_footer() { + let test_file = tempfile::tempfile().unwrap(); + let reader_result = parse_metadata2(&test_file); + assert_eq!( + reader_result.unwrap_err().to_string(), + "Parquet error: Invalid Parquet file. Size is smaller than footer" + ); + } + + #[test] + fn test_parse_metadata_corrupt_footer() { + let data = Bytes::from(vec![1, 2, 3, 4, 5, 6, 7, 8]); + let reader_result = parse_metadata2(&data); + assert_eq!( + reader_result.unwrap_err().to_string(), + "Parquet error: Invalid Parquet file. 
Corrupt footer" + ); + } + + #[test] + fn test_parse_metadata_invalid_start() { + let test_file = Bytes::from(vec![255, 0, 0, 0, b'P', b'A', b'R', b'1']); + let reader_result = parse_metadata2(&test_file); + assert_eq!( + reader_result.unwrap_err().to_string(), + "Parquet error: Invalid Parquet file. Reported metadata length of 255 + 8 byte footer, but file is only 8 bytes" + ); + } + + #[test] + fn test_metadata_column_orders_parse() { + // Define simple schema, we do not need to provide logical types. + let fields = vec![ + Arc::new( + SchemaType::primitive_type_builder("col1", Type::INT32) + .build() + .unwrap(), + ), + Arc::new( + SchemaType::primitive_type_builder("col2", Type::FLOAT) + .build() + .unwrap(), + ), + ]; + let schema = SchemaType::group_type_builder("schema") + .with_fields(fields) + .build() + .unwrap(); + let schema_descr = SchemaDescriptor::new(Arc::new(schema)); + + let t_column_orders = Some(vec![ + TColumnOrder::TYPEORDER(TypeDefinedOrder::new()), + TColumnOrder::TYPEORDER(TypeDefinedOrder::new()), + ]); + + assert_eq!( + parse_column_orders(t_column_orders, &schema_descr), + Some(vec![ + ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::SIGNED), + ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::SIGNED) + ]) + ); + + // Test when no column orders are defined. 
+ assert_eq!(parse_column_orders(None, &schema_descr), None); + } + + #[test] + #[should_panic(expected = "Column order length mismatch")] + fn test_metadata_column_orders_len_mismatch() { + let schema = SchemaType::group_type_builder("schema").build().unwrap(); + let schema_descr = SchemaDescriptor::new(Arc::new(schema)); + + let t_column_orders = Some(vec![TColumnOrder::TYPEORDER(TypeDefinedOrder::new())]); + + parse_column_orders(t_column_orders, &schema_descr); + } +} diff --git a/parquet/src/file/mod.rs b/parquet/src/file/mod.rs index 12ff35b51646..b36ef752ae6f 100644 --- a/parquet/src/file/mod.rs +++ b/parquet/src/file/mod.rs @@ -110,3 +110,4 @@ pub mod writer; /// The length of the parquet footer in bytes pub const FOOTER_SIZE: usize = 8; const PARQUET_MAGIC: [u8; 4] = [b'P', b'A', b'R', b'1']; +const PARQUET_MAGIC_ENCR_FOOTER: [u8; 4] = [b'P', b'A', b'R', b'E']; diff --git a/parquet/src/lib.rs b/parquet/src/lib.rs index 3ca0dbe98791..8c1c190ea871 100644 --- a/parquet/src/lib.rs +++ b/parquet/src/lib.rs @@ -140,6 +140,10 @@ pub mod column; experimental!(mod compression); experimental!(mod encodings); pub mod bloom_filter; + +//#[cfg(feature = "encryption")] +experimental!(mod encryption); + pub mod file; pub mod record; pub mod schema; From 3c1ca4f477461f81d17aeb7b5735ae86656c8d17 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Sat, 23 Nov 2024 22:55:38 +0100 Subject: [PATCH 02/97] Use ParquetMetaDataReader --- parquet/src/arrow/arrow_reader/mod.rs | 77 ++++---- parquet/src/arrow/async_reader/mod.rs | 5 +- parquet/src/encryption/ciphers.rs | 1 + parquet/src/file/footer.rs | 274 +------------------------- parquet/src/file/metadata/reader.rs | 94 +++++++-- 5 files changed, 127 insertions(+), 324 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 55f3ab027b37..0060338fcfff 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -42,8 +42,6 @@ mod filter; mod 
selection; pub mod statistics; -use crate::file::footer; -use crate::file::page_index::index_reader; use crate::encryption::ciphers::FileDecryptionProperties; /// Builder for constructing parquet readers into arrow. @@ -373,35 +371,6 @@ pub struct ArrowReaderMetadata { } impl ArrowReaderMetadata { - /// Loads [`ArrowReaderMetadata`] from the provided [`ChunkReader`] - /// - /// See [`ParquetRecordBatchReaderBuilder::new_with_metadata`] for how this can be used - pub fn load2(reader: &T, options: ArrowReaderOptions) -> Result { - Self::load_with_decryption(reader, options, FileDecryptionProperties::builder().build()) - } - - pub fn load_with_decryption(reader: &T, options: ArrowReaderOptions, - file_decryption_properties: FileDecryptionProperties) -> Result { - let mut metadata = footer::parse_metadata_with_decryption(reader, file_decryption_properties)?; - if options.page_index { - let column_index = metadata - .row_groups() - .iter() - .map(|rg| index_reader::read_columns_indexes(reader, rg.columns())) - .collect::>>()?; - metadata.set_column_index(Some(column_index)); - - let offset_index = metadata - .row_groups() - .iter() - .map(|rg| index_reader::read_offset_indexes(reader, rg.columns())) - .collect::>>()?; - - metadata.set_offset_index(Some(offset_index)) - } - Self::try_new(Arc::new(metadata), options) - } - /// Loads [`ArrowReaderMetadata`] from the provided [`ChunkReader`], if necessary /// /// See [`ParquetRecordBatchReaderBuilder::new_with_metadata`] for an @@ -412,9 +381,14 @@ impl ArrowReaderMetadata { /// If `options` has [`ArrowReaderOptions::with_page_index`] true, but /// `Self::metadata` is missing the page index, this function will attempt /// to load the page index by making an object store request. 
- pub fn load(reader: &T, options: ArrowReaderOptions) -> Result { + pub fn load( + reader: &T, + options: ArrowReaderOptions, + file_decryption_properties: Option, + ) -> Result { let metadata = ParquetMetaDataReader::new() .with_page_indexes(options.page_index) + .with_encryption_properties(file_decryption_properties) .parse_and_finish(reader)?; Self::try_new(Arc::new(metadata), options) } @@ -561,12 +535,16 @@ impl ParquetRecordBatchReaderBuilder { /// Create a new [`ParquetRecordBatchReaderBuilder`] with [`ArrowReaderOptions`] pub fn try_new_with_options(reader: T, options: ArrowReaderOptions) -> Result { - let metadata = ArrowReaderMetadata::load(&reader, options)?; + let metadata = ArrowReaderMetadata::load(&reader, options, None)?; Ok(Self::new_with_metadata(reader, metadata)) } - pub fn try_new_with_decryption(reader: T, options: ArrowReaderOptions, file_decryption_properties: FileDecryptionProperties) -> Result { - let metadata = ArrowReaderMetadata::load_with_decryption(&reader, options, file_decryption_properties)?; + pub fn try_new_with_decryption( + reader: T, + options: ArrowReaderOptions, + file_decryption_properties: Option, + ) -> Result { + let metadata = ArrowReaderMetadata::load(&reader, options, file_decryption_properties)?; Ok(Self::new_with_metadata(reader, metadata)) } @@ -826,11 +804,18 @@ impl ParquetRecordBatchReader { .build() } - pub fn try_new_with_decryption(reader: T, batch_size: usize, - file_decryption_properties: FileDecryptionProperties) -> Result { - ParquetRecordBatchReaderBuilder::try_new_with_decryption(reader, Default::default(), file_decryption_properties)? - .with_batch_size(batch_size) - .build() + pub fn try_new_with_decryption( + reader: T, + batch_size: usize, + file_decryption_properties: Option, + ) -> Result { + ParquetRecordBatchReaderBuilder::try_new_with_decryption( + reader, + Default::default(), + file_decryption_properties, + )? 
+ .with_batch_size(batch_size) + .build() } /// Create a new [`ParquetRecordBatchReader`] from the provided [`RowGroups`] @@ -1844,10 +1829,14 @@ mod tests { // todo let key_code: &[u8] = "0123456789012345".as_bytes(); // todo - let decryption_properties = ciphers::FileDecryptionProperties::builder() - .with_footer_key(key_code.to_vec()) - .build(); - let record_reader = ParquetRecordBatchReader::try_new_with_decryption(file, 128, decryption_properties).unwrap(); + let decryption_properties = Some( + ciphers::FileDecryptionProperties::builder() + .with_footer_key(key_code.to_vec()) + .build(), + ); + let record_reader = + ParquetRecordBatchReader::try_new_with_decryption(file, 128, decryption_properties) + .unwrap(); // todo check contents } diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index 2c8a59399de1..d7a6ef698e15 100644 --- a/parquet/src/arrow/async_reader/mod.rs +++ b/parquet/src/arrow/async_reader/mod.rs @@ -153,7 +153,10 @@ impl AsyncFileReader for T { let mut buf = Vec::with_capacity(metadata_len); self.take(metadata_len as _).read_to_end(&mut buf).await?; - Ok(Arc::new(ParquetMetaDataReader::decode_metadata(&buf)?)) + // TODO: add self.file_decryption_properties + Ok(Arc::new(ParquetMetaDataReader::decode_metadata( + &buf, None, + )?)) } .boxed() } diff --git a/parquet/src/encryption/ciphers.rs b/parquet/src/encryption/ciphers.rs index db32146c6d5f..a067b56a4e6c 100644 --- a/parquet/src/encryption/ciphers.rs +++ b/parquet/src/encryption/ciphers.rs @@ -226,6 +226,7 @@ pub fn create_module_aad(file_aad: &[u8], module_type: ModuleType, row_group_ord Ok(aad) } +#[derive(Clone)] pub struct FileDecryptionProperties { footer_key: Option> } diff --git a/parquet/src/file/footer.rs b/parquet/src/file/footer.rs index c7829420b514..3192eac4cde0 100644 --- a/parquet/src/file/footer.rs +++ b/parquet/src/file/footer.rs @@ -17,20 +17,9 @@ //! Module for working with Parquet file footers. 
-use std::{io::Read, sync::Arc}; - -use crate::format::{ColumnOrder as TColumnOrder, FileMetaData as TFileMetaData, - FileCryptoMetaData as TFileCryptoMetaData, EncryptionAlgorithm}; -use crate::thrift::{TCompactSliceInputProtocol, TSerializable}; - -use crate::basic::ColumnOrder; -use crate::encryption::ciphers; -use crate::encryption::ciphers::{BlockDecryptor, FileDecryptionProperties, FileDecryptor}; -use crate::errors::{ParquetError, Result}; -use crate::file::{metadata::*, reader::ChunkReader, - FOOTER_SIZE, PARQUET_MAGIC, PARQUET_MAGIC_ENCR_FOOTER}; - -use crate::schema::types::{self, SchemaDescriptor}; +use crate::encryption::ciphers::FileDecryptionProperties; +use crate::errors::Result; +use crate::file::{metadata::*, reader::ChunkReader, FOOTER_SIZE}; /// Reads the [ParquetMetaData] from the footer of the parquet file. /// @@ -61,105 +50,19 @@ pub fn parse_metadata(chunk_reader: &R) -> Result(chunk_reader: &R) -> Result { - parse_metadata_with_decryption(chunk_reader, FileDecryptionProperties::builder().build()) -} - -pub fn parse_metadata_with_decryption(chunk_reader: &R, decr_props: FileDecryptionProperties) -> Result { - // check file is large enough to hold footer - let file_size = chunk_reader.len(); - if file_size < (FOOTER_SIZE as u64) { - return Err(general_err!( - "Invalid Parquet file. Size is smaller than footer" - )); - } - - let mut footer = [0_u8; 8]; - chunk_reader - .get_read(file_size - 8)? - .read_exact(&mut footer)?; - - let encrypted_footer; - // check this is indeed a parquet file - if footer[4..] == PARQUET_MAGIC { - encrypted_footer = false; - } else if footer[4..] == PARQUET_MAGIC_ENCR_FOOTER { - encrypted_footer = true; - //panic!() // todo rm - } else { - return Err(general_err!("Invalid Parquet file. 
Corrupt footer")); - } - - // get the metadata length from the footer - let metadata_len = u32::from_le_bytes(footer[..4].try_into().unwrap()) as usize; - - //let metadata_len = decode_footer(&footer)?; todo rm this function - let footer_metadata_len = FOOTER_SIZE + metadata_len; - - if footer_metadata_len > file_size as usize { - return Err(general_err!( - "Invalid Parquet file. Reported metadata length of {} + {} byte footer, but file is only {} bytes", - metadata_len, - FOOTER_SIZE, - file_size - )); - } - - let start = file_size - footer_metadata_len as u64; - - if encrypted_footer { - let file_decryptor = FileDecryptor::new(decr_props); - decode_encrypted_metadata(chunk_reader.get_bytes(start, metadata_len)?.as_ref(), file_decryptor) - } else { - decode_metadata(chunk_reader.get_bytes(start, metadata_len)?.as_ref()) - } -} - /// Decodes [`ParquetMetaData`] from the provided bytes. /// /// Typically this is used to decode the metadata from the end of a parquet -/// file. The format of `buf` is the Thift compact binary protocol, as specified +/// file. The format of `buf` is the Thrift compact binary protocol, as specified /// by the [Parquet Spec]. 
/// /// [Parquet Spec]: https://github.com/apache/parquet-format#metadata #[deprecated(since = "53.1.0", note = "Use ParquetMetaDataReader::decode_metadata")] -pub fn decode_metadata(buf: &[u8]) -> Result { - ParquetMetaDataReader::decode_metadata(buf) -} - -pub fn decode_metadata2(buf: &[u8]) -> Result { - decode_metadata_with_decryption(buf) -} - -/// Decodes [`ParquetMetaData`] from the provided bytes -// todo add file decryptor -pub fn decode_metadata_with_decryption(buf: &[u8]) -> Result { - // TODO: row group filtering - let mut prot = TCompactSliceInputProtocol::new(buf); - let t_file_metadata: TFileMetaData = TFileMetaData::read_from_in_protocol(&mut prot) - .map_err(|e| ParquetError::General(format!("Could not parse metadata: {e}")))?; - let schema = types::from_thrift(&t_file_metadata.schema)?; - let schema_descr = Arc::new(SchemaDescriptor::new(schema)); - let mut row_groups = Vec::new(); - for rg in t_file_metadata.row_groups { - row_groups.push(RowGroupMetaData::from_thrift(schema_descr.clone(), rg)?); - } - let column_orders = parse_column_orders(t_file_metadata.column_orders, &schema_descr); - - if t_file_metadata.encryption_algorithm.is_some() { - // todo get key_metadata etc. 
Set file decryptor in return value - // todo check signature - } - - let file_metadata = FileMetaData::new( - t_file_metadata.version, - t_file_metadata.num_rows, - t_file_metadata.created_by, - t_file_metadata.key_value_metadata, - schema_descr, - column_orders, - ); - Ok(ParquetMetaData::new(file_metadata, row_groups)) +pub fn decode_metadata( + buf: &[u8], + file_decryption_properties: Option, +) -> Result { + ParquetMetaDataReader::decode_metadata(buf, file_decryption_properties) } /// Decodes the Parquet footer returning the metadata length in bytes @@ -177,162 +80,3 @@ pub fn decode_metadata_with_decryption(buf: &[u8]) -> Result { pub fn decode_footer(slice: &[u8; FOOTER_SIZE]) -> Result { ParquetMetaDataReader::decode_footer(slice) } - -fn decode_encrypted_metadata(buf: &[u8], file_decryptor: FileDecryptor) -> Result { - // parse FileCryptoMetaData - let mut prot = TCompactSliceInputProtocol::new(buf.as_ref()); - let t_file_crypto_metadata: TFileCryptoMetaData = TFileCryptoMetaData::read_from_in_protocol(&mut prot) - .map_err(|e| ParquetError::General(format!("Could not parse crypto metadata: {e}")))?; - let algo = t_file_crypto_metadata.encryption_algorithm; - let aes_gcm_algo = if let EncryptionAlgorithm::AESGCMV1(a) = algo { a } - else { unreachable!() }; // todo decr: add support for GCMCTRV1 - - // todo decr: get key_metadata - - // remaining buffer contains encrypted FileMetaData - let decryptor = file_decryptor.get_footer_decryptor(); - // todo decr: get aad_prefix - // todo decr: set both aad_prefix and aad_file_unique in file_decryptor - let fmd_aad = ciphers::create_footer_aad(aes_gcm_algo.aad_file_unique.unwrap().as_ref()); - let decrypted_fmd_buf = decryptor.decrypt(prot.as_slice().as_ref(), fmd_aad.unwrap().as_ref()); - - // todo add file decryptor - decode_metadata_with_decryption(decrypted_fmd_buf.as_slice()) -} - -// todo decr: add encryption support -/// Decodes the footer returning the metadata length in bytes -pub fn decode_footer2(slice: 
&[u8; FOOTER_SIZE]) -> Result { - // check this is indeed a parquet file - if slice[4..] != PARQUET_MAGIC { - return Err(general_err!("Invalid Parquet file. Corrupt footer")); - } - - // get the metadata length from the footer - let metadata_len = u32::from_le_bytes(slice[..4].try_into().unwrap()); - // u32 won't be larger than usize in most cases - Ok(metadata_len as usize) -} - -/// Parses column orders from Thrift definition. -/// If no column orders are defined, returns `None`. -fn parse_column_orders( - t_column_orders: Option>, - schema_descr: &SchemaDescriptor, -) -> Option> { - match t_column_orders { - Some(orders) => { - // Should always be the case - assert_eq!( - orders.len(), - schema_descr.num_columns(), - "Column order length mismatch" - ); - let mut res = Vec::new(); - for (i, column) in schema_descr.columns().iter().enumerate() { - match orders[i] { - TColumnOrder::TYPEORDER(_) => { - let sort_order = ColumnOrder::get_sort_order( - column.logical_type(), - column.converted_type(), - column.physical_type(), - ); - res.push(ColumnOrder::TYPE_DEFINED_ORDER(sort_order)); - } - } - } - Some(res) - } - None => None, - } -} - -#[cfg(test)] -mod tests { - use super::*; - use bytes::Bytes; - - use crate::basic::SortOrder; - use crate::basic::Type; - use crate::format::TypeDefinedOrder; - use crate::schema::types::Type as SchemaType; - - #[test] - fn test_parse_metadata_size_smaller_than_footer() { - let test_file = tempfile::tempfile().unwrap(); - let reader_result = parse_metadata2(&test_file); - assert_eq!( - reader_result.unwrap_err().to_string(), - "Parquet error: Invalid Parquet file. Size is smaller than footer" - ); - } - - #[test] - fn test_parse_metadata_corrupt_footer() { - let data = Bytes::from(vec![1, 2, 3, 4, 5, 6, 7, 8]); - let reader_result = parse_metadata2(&data); - assert_eq!( - reader_result.unwrap_err().to_string(), - "Parquet error: Invalid Parquet file. 
Corrupt footer" - ); - } - - #[test] - fn test_parse_metadata_invalid_start() { - let test_file = Bytes::from(vec![255, 0, 0, 0, b'P', b'A', b'R', b'1']); - let reader_result = parse_metadata2(&test_file); - assert_eq!( - reader_result.unwrap_err().to_string(), - "Parquet error: Invalid Parquet file. Reported metadata length of 255 + 8 byte footer, but file is only 8 bytes" - ); - } - - #[test] - fn test_metadata_column_orders_parse() { - // Define simple schema, we do not need to provide logical types. - let fields = vec![ - Arc::new( - SchemaType::primitive_type_builder("col1", Type::INT32) - .build() - .unwrap(), - ), - Arc::new( - SchemaType::primitive_type_builder("col2", Type::FLOAT) - .build() - .unwrap(), - ), - ]; - let schema = SchemaType::group_type_builder("schema") - .with_fields(fields) - .build() - .unwrap(); - let schema_descr = SchemaDescriptor::new(Arc::new(schema)); - - let t_column_orders = Some(vec![ - TColumnOrder::TYPEORDER(TypeDefinedOrder::new()), - TColumnOrder::TYPEORDER(TypeDefinedOrder::new()), - ]); - - assert_eq!( - parse_column_orders(t_column_orders, &schema_descr), - Some(vec![ - ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::SIGNED), - ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::SIGNED) - ]) - ); - - // Test when no column orders are defined. 
- assert_eq!(parse_column_orders(None, &schema_descr), None); - } - - #[test] - #[should_panic(expected = "Column order length mismatch")] - fn test_metadata_column_orders_len_mismatch() { - let schema = SchemaType::group_type_builder("schema").build().unwrap(); - let schema_descr = SchemaDescriptor::new(Arc::new(schema)); - - let t_column_orders = Some(vec![TColumnOrder::TYPEORDER(TypeDefinedOrder::new())]); - - parse_column_orders(t_column_orders, &schema_descr); - } -} diff --git a/parquet/src/file/metadata/reader.rs b/parquet/src/file/metadata/reader.rs index d465a49c3544..c27f35402757 100644 --- a/parquet/src/file/metadata/reader.rs +++ b/parquet/src/file/metadata/reader.rs @@ -20,13 +20,19 @@ use std::{io::Read, ops::Range, sync::Arc}; use bytes::Bytes; use crate::basic::ColumnOrder; +use crate::encryption::ciphers::{ + create_footer_aad, BlockDecryptor, FileDecryptionProperties, FileDecryptor, +}; use crate::errors::{ParquetError, Result}; use crate::file::metadata::{FileMetaData, ParquetMetaData, RowGroupMetaData}; use crate::file::page_index::index::Index; use crate::file::page_index::index_reader::{acc_range, decode_column_index, decode_offset_index}; use crate::file::reader::ChunkReader; -use crate::file::{FOOTER_SIZE, PARQUET_MAGIC}; -use crate::format::{ColumnOrder as TColumnOrder, FileMetaData as TFileMetaData}; +use crate::file::{FOOTER_SIZE, PARQUET_MAGIC, PARQUET_MAGIC_ENCR_FOOTER}; +use crate::format::{ + ColumnOrder as TColumnOrder, EncryptionAlgorithm, FileCryptoMetaData as TFileCryptoMetaData, + FileMetaData as TFileMetaData, +}; use crate::schema::types; use crate::schema::types::SchemaDescriptor; use crate::thrift::{TCompactSliceInputProtocol, TSerializable}; @@ -68,6 +74,7 @@ pub struct ParquetMetaDataReader { // Size of the serialized thrift metadata plus the 8 byte footer. Only set if // `self.parse_metadata` is called. 
metadata_size: Option, + file_decryption_properties: Option, } impl ParquetMetaDataReader { @@ -126,6 +133,17 @@ impl ParquetMetaDataReader { self } + /// Provide the [`FileDecryptionProperties`] to use when decrypting the file. + /// + /// This is only necessary when the file is encrypted. + pub fn with_encryption_properties( + mut self, + properties: Option, + ) -> Self { + self.file_decryption_properties = properties; + self + } + /// Indicates whether this reader has a [`ParquetMetaData`] internally. pub fn has_metadata(&self) -> bool { self.metadata.is_some() @@ -372,8 +390,13 @@ impl ParquetMetaDataReader { mut fetch: F, file_size: usize, ) -> Result<()> { - let (metadata, remainder) = - Self::load_metadata(&mut fetch, file_size, self.get_prefetch_size()).await?; + let (metadata, remainder) = Self::load_metadata( + &mut fetch, + file_size, + self.get_prefetch_size(), + self.file_decryption_properties.clone(), + ) + .await?; self.metadata = Some(metadata); @@ -519,7 +542,10 @@ impl ParquetMetaDataReader { } let start = file_size - footer_metadata_len as u64; - Self::decode_metadata(chunk_reader.get_bytes(start, metadata_len)?.as_ref()) + Self::decode_metadata( + chunk_reader.get_bytes(start, metadata_len)?.as_ref(), + self.file_decryption_properties.clone(), + ) } /// Return the number of bytes to read in the initial pass. 
If `prefetch_size` has @@ -540,6 +566,7 @@ impl ParquetMetaDataReader { fetch: &mut F, file_size: usize, prefetch: usize, + file_decryption_properties: Option, ) -> Result<(ParquetMetaData, Option<(usize, Bytes)>)> { if file_size < FOOTER_SIZE { return Err(eof_err!("file size of {} is less than footer", file_size)); @@ -578,12 +605,15 @@ impl ParquetMetaDataReader { if length > suffix_len - FOOTER_SIZE { let metadata_start = file_size - length - FOOTER_SIZE; let meta = fetch.fetch(metadata_start..file_size - FOOTER_SIZE).await?; - Ok((Self::decode_metadata(&meta)?, None)) + Ok(( + Self::decode_metadata(&meta, file_decryption_properties)?, + None, + )) } else { let metadata_start = file_size - length - FOOTER_SIZE - footer_start; let slice = &suffix[metadata_start..suffix_len - FOOTER_SIZE]; Ok(( - Self::decode_metadata(slice)?, + Self::decode_metadata(slice, file_decryption_properties)?, Some((footer_start, suffix.slice(..metadata_start))), )) } @@ -593,16 +623,16 @@ impl ParquetMetaDataReader { /// /// A parquet footer is 8 bytes long and has the following layout: /// * 4 bytes for the metadata length - /// * 4 bytes for the magic bytes 'PAR1' + /// * 4 bytes for the magic bytes 'PAR1' or 'PARE' (encrypted footer) /// /// ```text - /// +-----+--------+ - /// | len | 'PAR1' | - /// +-----+--------+ + /// +-----+------------------+ + /// | len | 'PAR1' or 'PARE' | + /// +-----+------------------+ /// ``` pub fn decode_footer(slice: &[u8; FOOTER_SIZE]) -> Result { // check this is indeed a parquet file - if slice[4..] != PARQUET_MAGIC { + if slice[4..] != PARQUET_MAGIC && slice[4..] != PARQUET_MAGIC_ENCR_FOOTER { return Err(general_err!("Invalid Parquet file. Corrupt footer")); } @@ -615,23 +645,59 @@ impl ParquetMetaDataReader { /// Decodes [`ParquetMetaData`] from the provided bytes. /// /// Typically this is used to decode the metadata from the end of a parquet - /// file. The format of `buf` is the Thift compact binary protocol, as specified + /// file. 
The format of `buf` is the Thrift compact binary protocol, as specified /// by the [Parquet Spec]. /// /// [Parquet Spec]: https://github.com/apache/parquet-format#metadata - pub fn decode_metadata(buf: &[u8]) -> Result { + pub fn decode_metadata( + buf: &[u8], + file_decryption_properties: Option, + ) -> Result { let mut prot = TCompactSliceInputProtocol::new(buf); + + let decrypted_fmd_buf; + if let Some(file_decryption_properties) = file_decryption_properties { + let t_file_crypto_metadata: TFileCryptoMetaData = + TFileCryptoMetaData::read_from_in_protocol(&mut prot) + .map_err(|e| general_err!("Could not parse crypto metadata: {}", e))?; + let algo = t_file_crypto_metadata.encryption_algorithm; + let aes_gcm_algo = if let EncryptionAlgorithm::AESGCMV1(a) = algo { + a + } else { + unreachable!() + }; // todo decr: add support for GCMCTRV1 + + // todo decr: get key_metadata + + // remaining buffer contains encrypted FileMetaData + let file_decryptor = FileDecryptor::new(file_decryption_properties); + let decryptor = file_decryptor.get_footer_decryptor(); + // todo decr: get aad_prefix + // todo decr: set both aad_prefix and aad_file_unique in file_decryptor + let fmd_aad = create_footer_aad(aes_gcm_algo.aad_file_unique.unwrap().as_ref()); + decrypted_fmd_buf = + decryptor.decrypt(prot.as_slice().as_ref(), fmd_aad.unwrap().as_ref()); + prot = TCompactSliceInputProtocol::new(decrypted_fmd_buf.as_ref()); + } + let t_file_metadata: TFileMetaData = TFileMetaData::read_from_in_protocol(&mut prot) .map_err(|e| general_err!("Could not parse metadata: {}", e))?; let schema = types::from_thrift(&t_file_metadata.schema)?; let schema_descr = Arc::new(SchemaDescriptor::new(schema)); let mut row_groups = Vec::new(); + // TODO: row group filtering for rg in t_file_metadata.row_groups { row_groups.push(RowGroupMetaData::from_thrift(schema_descr.clone(), rg)?); } let column_orders = Self::parse_column_orders(t_file_metadata.column_orders, &schema_descr)?; + // todo add file 
decryptor + if t_file_metadata.encryption_algorithm.is_some() { + // todo get key_metadata etc. Set file decryptor in return value + // todo check signature + } + let file_metadata = FileMetaData::new( t_file_metadata.version, t_file_metadata.num_rows, From a1bf0ea3ed8132c7179494833936fd06ce68124a Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Sat, 23 Nov 2024 23:57:00 +0100 Subject: [PATCH 03/97] Fix CI --- parquet/src/arrow/arrow_reader/mod.rs | 7 ++++++- parquet/src/arrow/async_reader/metadata.rs | 6 ++++-- parquet/src/encryption/ciphers.rs | 2 +- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 0060338fcfff..2d6b24418f0a 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -539,6 +539,7 @@ impl ParquetRecordBatchReaderBuilder { Ok(Self::new_with_metadata(reader, metadata)) } + /// Create a new [`ParquetRecordBatchReaderBuilder`] with [`ArrowReaderOptions`] and [`FileDecryptionProperties`] pub fn try_new_with_decryption( reader: T, options: ArrowReaderOptions, @@ -569,6 +570,7 @@ impl ParquetRecordBatchReaderBuilder { /// # use arrow_schema::{DataType, Field, Schema}; /// # use parquet::arrow::arrow_reader::{ArrowReaderMetadata, ParquetRecordBatchReader, ParquetRecordBatchReaderBuilder}; /// # use parquet::arrow::ArrowWriter; + /// # /// # let mut file: Vec = Vec::with_capacity(1024); /// # let schema = Arc::new(Schema::new(vec![Field::new("i32", DataType::Int32, false)])); /// # let mut writer = ArrowWriter::try_new(&mut file, schema.clone(), None).unwrap(); @@ -577,7 +579,7 @@ impl ParquetRecordBatchReaderBuilder { /// # writer.close().unwrap(); /// # let file = Bytes::from(file); /// # - /// let metadata = ArrowReaderMetadata::load(&file, Default::default()).unwrap(); + /// let metadata = ArrowReaderMetadata::load(&file, Default::default(), None).unwrap(); /// let mut a = 
ParquetRecordBatchReaderBuilder::new_with_metadata(file.clone(), metadata.clone()).build().unwrap(); /// let mut b = ParquetRecordBatchReaderBuilder::new_with_metadata(file, metadata).build().unwrap(); /// @@ -804,6 +806,9 @@ impl ParquetRecordBatchReader { .build() } + /// Create a new [`ParquetRecordBatchReader`] from the provided chunk reader and [`FileDecryptionProperties`] + /// + /// Note: this is needed when the parquet file is encrypted pub fn try_new_with_decryption( reader: T, batch_size: usize, diff --git a/parquet/src/arrow/async_reader/metadata.rs b/parquet/src/arrow/async_reader/metadata.rs index 526818845b5c..084131b9cdcf 100644 --- a/parquet/src/arrow/async_reader/metadata.rs +++ b/parquet/src/arrow/async_reader/metadata.rs @@ -127,13 +127,15 @@ impl MetadataLoader { let (metadata, remainder) = if length > suffix_len - FOOTER_SIZE { let metadata_start = file_size - length - FOOTER_SIZE; let meta = fetch.fetch(metadata_start..file_size - FOOTER_SIZE).await?; - (ParquetMetaDataReader::decode_metadata(&meta)?, None) + // TODO: this won't decrypt + (ParquetMetaDataReader::decode_metadata(&meta, None)?, None) } else { let metadata_start = file_size - length - FOOTER_SIZE - footer_start; let slice = &suffix[metadata_start..suffix_len - FOOTER_SIZE]; ( - ParquetMetaDataReader::decode_metadata(slice)?, + // TODO: this won't decrypt + ParquetMetaDataReader::decode_metadata(slice, None)?, Some((footer_start, suffix.slice(..metadata_start))), ) }; diff --git a/parquet/src/encryption/ciphers.rs b/parquet/src/encryption/ciphers.rs index a067b56a4e6c..2b7ffb933281 100644 --- a/parquet/src/encryption/ciphers.rs +++ b/parquet/src/encryption/ciphers.rs @@ -171,7 +171,7 @@ pub fn create_footer_aad(file_aad: &[u8]) -> Result> { create_module_aad(file_aad, ModuleType::Footer, -1, -1, -1) } -pub fn create_module_aad(file_aad: &[u8], module_type: ModuleType, row_group_ordinal: i32, +fn create_module_aad(file_aad: &[u8], module_type: ModuleType, row_group_ordinal: i32, 
column_ordinal: i32, page_ordinal: i32) -> Result> { let module_buf = [module_type as u8]; From d75207306055ccd42e78b12405cfa3b25cd533dc Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 4 Dec 2024 00:56:31 +0100 Subject: [PATCH 04/97] test --- parquet/src/arrow/arrow_reader/mod.rs | 36 ++++++++++++++++++--------- 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 2d6b24418f0a..1a5584e598cd 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -979,8 +979,9 @@ mod tests { use arrow_select::concat::concat_batches; use crate::arrow::arrow_reader::{ - ArrowPredicateFn, ArrowReaderBuilder, ArrowReaderOptions, ParquetRecordBatchReader, - ParquetRecordBatchReaderBuilder, RowFilter, RowSelection, RowSelector, + ArrowPredicateFn, ArrowReaderBuilder, ArrowReaderMetadata, ArrowReaderOptions, + ParquetRecordBatchReader, ParquetRecordBatchReaderBuilder, RowFilter, RowSelection, + RowSelector, }; use crate::arrow::schema::add_encoded_arrow_schema_to_metadata; use crate::arrow::{ArrowWriter, ProjectionMask}; @@ -1826,23 +1827,34 @@ mod tests { #[test] fn test_uniform_encryption() { - let path = format!( - "{}/uniform_encryption.parquet.encrypted", - arrow::util::test_util::parquet_test_data(), - ); + let testdata = arrow::util::test_util::parquet_test_data(); + let path = format!("{testdata}/uniform_encryption.parquet.encrypted"); let file = File::open(path).unwrap(); - // todo + let key_code: &[u8] = "0123456789012345".as_bytes(); - // todo let decryption_properties = Some( ciphers::FileDecryptionProperties::builder() .with_footer_key(key_code.to_vec()) .build(), ); - let record_reader = - ParquetRecordBatchReader::try_new_with_decryption(file, 128, decryption_properties) - .unwrap(); - // todo check contents + + let metadata = ArrowReaderMetadata::load(&file, Default::default(), decryption_properties.clone()).unwrap(); + let 
file_metadata = metadata.metadata.file_metadata(); + + assert_eq!(file_metadata.num_rows(), 50); + assert_eq!(file_metadata.schema_descr().num_columns(), 8); + assert_eq!(file_metadata.created_by().unwrap(), "parquet-cpp-arrow version 14.0.0-SNAPSHOT"); + + metadata.metadata.row_groups().iter().for_each(|rg| { + assert_eq!(rg.num_columns(), 8); + assert_eq!(rg.num_rows(), 50); + assert_eq!(rg.total_byte_size(), 4172); + }); + + // todo: decrypting data + // let record_reader = + // ParquetRecordBatchReader::try_new_with_decryption(file, 128, decryption_properties) + // .unwrap(); } #[test] From 8e2e1185ef6e0d1e3fde998a9f59393b5b433e2b Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 11 Dec 2024 01:47:59 +0100 Subject: [PATCH 05/97] save progress --- parquet/src/arrow/arrow_reader/mod.rs | 33 +++++++++--- parquet/src/column/writer/mod.rs | 4 ++ parquet/src/encryption/ciphers.rs | 28 ++++++++-- parquet/src/file/footer.rs | 2 +- parquet/src/file/metadata/mod.rs | 14 +++-- parquet/src/file/metadata/reader.rs | 21 ++++---- parquet/src/file/serialized_reader.rs | 77 ++++++++++++++++++++++----- parquet/src/file/writer.rs | 1 + 8 files changed, 143 insertions(+), 37 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 1a5584e598cd..72be5f75c455 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -384,7 +384,7 @@ impl ArrowReaderMetadata { pub fn load( reader: &T, options: ArrowReaderOptions, - file_decryption_properties: Option, + file_decryption_properties: Option<&FileDecryptionProperties>, ) -> Result { let metadata = ParquetMetaDataReader::new() .with_page_indexes(options.page_index) @@ -543,7 +543,7 @@ impl ParquetRecordBatchReaderBuilder { pub fn try_new_with_decryption( reader: T, options: ArrowReaderOptions, - file_decryption_properties: Option, + file_decryption_properties: Option<&FileDecryptionProperties>, ) -> Result { let metadata = 
ArrowReaderMetadata::load(&reader, options, file_decryption_properties)?; Ok(Self::new_with_metadata(reader, metadata)) @@ -809,10 +809,11 @@ impl ParquetRecordBatchReader { /// Create a new [`ParquetRecordBatchReader`] from the provided chunk reader and [`FileDecryptionProperties`] /// /// Note: this is needed when the parquet file is encrypted + // todo: add options or put file_decryption_properties into options pub fn try_new_with_decryption( reader: T, batch_size: usize, - file_decryption_properties: Option, + file_decryption_properties: Option<&FileDecryptionProperties>, ) -> Result { ParquetRecordBatchReaderBuilder::try_new_with_decryption( reader, @@ -1838,7 +1839,7 @@ mod tests { .build(), ); - let metadata = ArrowReaderMetadata::load(&file, Default::default(), decryption_properties.clone()).unwrap(); + let metadata = ArrowReaderMetadata::load(&file, Default::default(), decryption_properties.as_ref()).unwrap(); let file_metadata = metadata.metadata.file_metadata(); assert_eq!(file_metadata.num_rows(), 50); @@ -1852,9 +1853,27 @@ mod tests { }); // todo: decrypting data - // let record_reader = - // ParquetRecordBatchReader::try_new_with_decryption(file, 128, decryption_properties) - // .unwrap(); + let record_reader = + ParquetRecordBatchReader::try_new_with_decryption(file, 128, decryption_properties.as_ref()) + .unwrap(); + // todo check contents + let mut row_count = 0; + for batch in record_reader { + let batch = batch.unwrap(); + row_count += batch.num_rows(); + let f32_col = batch.column(0).as_primitive::(); + let f64_col = batch.column(1).as_primitive::(); + + // This file contains floats from a standard normal distribution + for &x in f32_col.values() { + assert!(x > -10.0); + assert!(x < 10.0); + } + for &x in f64_col.values() { + assert!(x > -10.0); + assert!(x < 10.0); + } + } } #[test] diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index 5f34f34cbb7a..2c0ba2e05ad1 100644 --- a/parquet/src/column/writer/mod.rs 
+++ b/parquet/src/column/writer/mod.rs @@ -2105,6 +2105,7 @@ mod tests { r.rows_written as usize, None, Arc::new(props), + None, ) .unwrap(); @@ -2157,6 +2158,7 @@ mod tests { r.rows_written as usize, None, Arc::new(props), + None, ) .unwrap(); @@ -2292,6 +2294,7 @@ mod tests { r.rows_written as usize, None, Arc::new(props), + None, ) .unwrap(), ); @@ -3741,6 +3744,7 @@ mod tests { result.rows_written as usize, None, Arc::new(props), + None, ) .unwrap(), ); diff --git a/parquet/src/encryption/ciphers.rs b/parquet/src/encryption/ciphers.rs index 2b7ffb933281..b4b7f47df5b0 100644 --- a/parquet/src/encryption/ciphers.rs +++ b/parquet/src/encryption/ciphers.rs @@ -119,6 +119,7 @@ impl BlockEncryptor for RingGcmBlockEncryptor { } } +#[derive(Debug, Clone)] pub(crate) struct RingGcmBlockDecryptor { key: LessSafeKey, } @@ -226,7 +227,7 @@ fn create_module_aad(file_aad: &[u8], module_type: ModuleType, row_group_ordinal Ok(aad) } -#[derive(Clone)] +#[derive(Debug, Clone, PartialEq)] pub struct FileDecryptionProperties { footer_key: Option> } @@ -261,18 +262,25 @@ impl DecryptionPropertiesBuilder { } } +#[derive(Debug, Clone)] pub struct FileDecryptor { decryption_properties: FileDecryptionProperties, // todo decr: change to BlockDecryptor footer_decryptor: RingGcmBlockDecryptor } +impl PartialEq for FileDecryptor { + fn eq(&self, other: &Self) -> bool { + self.decryption_properties == other.decryption_properties + } +} + impl FileDecryptor { - pub(crate) fn new(decryption_properties: FileDecryptionProperties) -> Self { + pub(crate) fn new(decryption_properties: &FileDecryptionProperties) -> Self { Self { // todo decr: if no key available yet (not set in properties, will be retrieved from metadata) footer_decryptor: RingGcmBlockDecryptor::new(decryption_properties.footer_key.clone().unwrap().as_ref()), - decryption_properties + decryption_properties: decryption_properties.clone() } } @@ -281,3 +289,17 @@ impl FileDecryptor { self.footer_decryptor } } + +pub struct 
CryptoContext { + row_group_ordinal: i32, + column_ordinal: i32, + metadata_decryptor: FileDecryptor, + data_decryptor: FileDecryptor, + file_decryption_properties: FileDecryptionProperties, + aad: Vec, +} + +impl CryptoContext { + pub fn data_decryptor(self) -> FileDecryptor { self.data_decryptor } + pub fn file_decryption_properties(&self) -> &FileDecryptionProperties { &self.file_decryption_properties } +} diff --git a/parquet/src/file/footer.rs b/parquet/src/file/footer.rs index 3192eac4cde0..af34fafb2e81 100644 --- a/parquet/src/file/footer.rs +++ b/parquet/src/file/footer.rs @@ -60,7 +60,7 @@ pub fn parse_metadata(chunk_reader: &R) -> Result, + file_decryption_properties: Option<&FileDecryptionProperties>, ) -> Result { ParquetMetaDataReader::decode_metadata(buf, file_decryption_properties) } diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index 252cb99f3f36..f5a3c7e599be 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -117,6 +117,7 @@ use crate::schema::types::{ pub use reader::ParquetMetaDataReader; pub use writer::ParquetMetaDataWriter; pub(crate) use writer::ThriftMetadataWriter; +use crate::encryption::ciphers::FileDecryptor; /// Page level statistics for each column chunk of each row group. 
/// @@ -174,15 +175,18 @@ pub struct ParquetMetaData { column_index: Option, /// Offset index for each page in each column chunk offset_index: Option, + /// Optional file decryptor + file_decryptor: Option, } impl ParquetMetaData { /// Creates Parquet metadata from file metadata and a list of row /// group metadata - pub fn new(file_metadata: FileMetaData, row_groups: Vec) -> Self { + pub fn new(file_metadata: FileMetaData, row_groups: Vec, file_decryptor: Option) -> Self { ParquetMetaData { file_metadata, row_groups, + file_decryptor, column_index: None, offset_index: None, } @@ -325,7 +329,7 @@ pub struct ParquetMetaDataBuilder(ParquetMetaData); impl ParquetMetaDataBuilder { /// Create a new builder from a file metadata, with no row groups pub fn new(file_meta_data: FileMetaData) -> Self { - Self(ParquetMetaData::new(file_meta_data, vec![])) + Self(ParquetMetaData::new(file_meta_data, vec![], None)) } /// Create a new builder from an existing ParquetMetaData @@ -528,6 +532,8 @@ pub struct RowGroupMetaData { ordinal: Option, } +// todo:rok + impl RowGroupMetaData { /// Returns builder for row group metadata. 
pub fn builder(schema_descr: SchemaDescPtr) -> RowGroupMetaDataBuilder { @@ -1849,7 +1855,7 @@ mod tests { let parquet_meta = ParquetMetaDataBuilder::new(file_metadata.clone()) .set_row_groups(row_group_meta_with_stats) .build(); - let base_expected_size = 2312; + let base_expected_size = 2896; assert_eq!(parquet_meta.memory_size(), base_expected_size); @@ -1876,7 +1882,7 @@ mod tests { ]])) .build(); - let bigger_expected_size = 2816; + let bigger_expected_size = 3400; // more set fields means more memory usage assert!(bigger_expected_size > base_expected_size); assert_eq!(parquet_meta.memory_size(), bigger_expected_size); diff --git a/parquet/src/file/metadata/reader.rs b/parquet/src/file/metadata/reader.rs index c27f35402757..39325f2c440d 100644 --- a/parquet/src/file/metadata/reader.rs +++ b/parquet/src/file/metadata/reader.rs @@ -138,9 +138,9 @@ impl ParquetMetaDataReader { /// This is only necessary when the file is encrypted. pub fn with_encryption_properties( mut self, - properties: Option, + properties: Option<&FileDecryptionProperties>, ) -> Self { - self.file_decryption_properties = properties; + self.file_decryption_properties = properties.cloned(); self } @@ -394,7 +394,7 @@ impl ParquetMetaDataReader { &mut fetch, file_size, self.get_prefetch_size(), - self.file_decryption_properties.clone(), + self.file_decryption_properties.as_ref(), ) .await?; @@ -544,7 +544,7 @@ impl ParquetMetaDataReader { let start = file_size - footer_metadata_len as u64; Self::decode_metadata( chunk_reader.get_bytes(start, metadata_len)?.as_ref(), - self.file_decryption_properties.clone(), + self.file_decryption_properties.as_ref(), ) } @@ -566,7 +566,7 @@ impl ParquetMetaDataReader { fetch: &mut F, file_size: usize, prefetch: usize, - file_decryption_properties: Option, + file_decryption_properties: Option<&FileDecryptionProperties>, ) -> Result<(ParquetMetaData, Option<(usize, Bytes)>)> { if file_size < FOOTER_SIZE { return Err(eof_err!("file size of {} is less than footer", 
file_size)); @@ -651,10 +651,11 @@ impl ParquetMetaDataReader { /// [Parquet Spec]: https://github.com/apache/parquet-format#metadata pub fn decode_metadata( buf: &[u8], - file_decryption_properties: Option, + file_decryption_properties: Option<&FileDecryptionProperties>, ) -> Result { let mut prot = TCompactSliceInputProtocol::new(buf); + let mut file_decryptor = None; let decrypted_fmd_buf; if let Some(file_decryption_properties) = file_decryption_properties { let t_file_crypto_metadata: TFileCryptoMetaData = @@ -670,13 +671,13 @@ impl ParquetMetaDataReader { // todo decr: get key_metadata // remaining buffer contains encrypted FileMetaData - let file_decryptor = FileDecryptor::new(file_decryption_properties); - let decryptor = file_decryptor.get_footer_decryptor(); + file_decryptor = Some(FileDecryptor::new(file_decryption_properties)); + let decryptor = file_decryptor.clone().unwrap().get_footer_decryptor(); // todo decr: get aad_prefix // todo decr: set both aad_prefix and aad_file_unique in file_decryptor let fmd_aad = create_footer_aad(aes_gcm_algo.aad_file_unique.unwrap().as_ref()); decrypted_fmd_buf = - decryptor.decrypt(prot.as_slice().as_ref(), fmd_aad.unwrap().as_ref()); + decryptor.decrypt(prot.as_slice().as_ref(), fmd_aad?.as_ref()); prot = TCompactSliceInputProtocol::new(decrypted_fmd_buf.as_ref()); } @@ -706,7 +707,7 @@ impl ParquetMetaDataReader { schema_descr, column_orders, ); - Ok(ParquetMetaData::new(file_metadata, row_groups)) + Ok(ParquetMetaData::new(file_metadata, row_groups, file_decryptor)) } /// Parses column orders from Thrift definition. 
diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index 81ba0a66463e..a5f0c57750b8 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -21,7 +21,6 @@ use std::collections::VecDeque; use std::iter; use std::{fs::File, io::Read, path::Path, sync::Arc}; - use crate::basic::{Encoding, Type}; use crate::bloom_filter::Sbbf; use crate::column::page::{Page, PageMetadata, PageReader}; @@ -34,13 +33,14 @@ use crate::file::{ reader::*, statistics, }; -use crate::format::{PageHeader, PageLocation, PageType}; +use crate::format::{PageHeader, PageLocation, PageType, FileCryptoMetaData as TFileCryptoMetaData, EncryptionAlgorithm}; use crate::record::reader::RowIter; use crate::record::Row; use crate::schema::types::Type as SchemaType; use crate::thrift::{TCompactSliceInputProtocol, TSerializable}; -use bytes::Bytes; +use bytes::{Buf, Bytes}; use thrift::protocol::TCompactInputProtocol; +use crate::encryption::ciphers::{create_footer_aad, BlockDecryptor, CryptoContext, FileDecryptionProperties, FileDecryptor, RingGcmBlockDecryptor}; impl TryFrom for SerializedFileReader { type Error = ParquetError; @@ -324,6 +324,7 @@ impl RowGroupReader for SerializedRowGroupReader<'_, R self.metadata.num_rows() as usize, page_locations, props, + None, )?)) } @@ -338,14 +339,37 @@ impl RowGroupReader for SerializedRowGroupReader<'_, R } /// Reads a [`PageHeader`] from the provided [`Read`] -pub(crate) fn read_page_header(input: &mut T) -> Result { - let mut prot = TCompactInputProtocol::new(input); +pub(crate) fn read_page_header(input: &mut T, crypto_context: Option<&CryptoContext>) -> Result { + let buf = &mut []; + let size = input.read(buf)?; + + // todo: decrypt buffer + let mut prot = TCompactSliceInputProtocol::new(buf.as_slice()); + let t_file_crypto_metadata: TFileCryptoMetaData = + TFileCryptoMetaData::read_from_in_protocol(&mut prot) + .map_err(|e| general_err!("Could not parse crypto metadata: {}", 
e))?; + + let file_decryption_properties = crypto_context.unwrap().file_decryption_properties(); + let file_decryptor = FileDecryptor::new(file_decryption_properties); + + // let fmd_aad = create_footer_aad(aes_gcm_algo.aad_file_unique.unwrap().as_ref()); + let algo = t_file_crypto_metadata.encryption_algorithm; + let aes_gcm_algo = if let EncryptionAlgorithm::AESGCMV1(a) = algo { + a + } else { + unreachable!() + }; // todo decr: add support for GCMCTRV1 + let fmd_aad = create_footer_aad(aes_gcm_algo.aad_file_unique.unwrap().as_ref()); + let buf2 = file_decryptor.get_footer_decryptor().decrypt(prot.as_slice().as_ref(), fmd_aad?.as_ref()); + + let mut prot = TCompactInputProtocol::new(buf2.reader()); + let page_header = PageHeader::read_from_in_protocol(&mut prot)?; Ok(page_header) } /// Reads a [`PageHeader`] from the provided [`Read`] returning the number of bytes read -fn read_page_header_len(input: &mut T) -> Result<(usize, PageHeader)> { +fn read_page_header_len(input: &mut T, crypto_context: Option<&CryptoContext>) -> Result<(usize, PageHeader)> { /// A wrapper around a [`std::io::Read`] that keeps track of the bytes read struct TrackedRead { inner: R, @@ -364,7 +388,7 @@ fn read_page_header_len(input: &mut T) -> Result<(usize, PageHeader)> { inner: input, bytes_read: 0, }; - let header = read_page_header(&mut tracked)?; + let header = read_page_header(&mut tracked, crypto_context)?; Ok((tracked.bytes_read, header)) } @@ -512,6 +536,9 @@ pub struct SerializedPageReader { physical_type: Type, state: SerializedPageReaderState, + + /// Crypto context + crypto_context: Option<&'static CryptoContext>, } impl SerializedPageReader { @@ -523,7 +550,7 @@ impl SerializedPageReader { page_locations: Option>, ) -> Result { let props = Arc::new(ReaderProperties::builder().build()); - SerializedPageReader::new_with_properties(reader, meta, total_rows, page_locations, props) + SerializedPageReader::new_with_properties(reader, meta, total_rows, page_locations, props, None) } 
/// Creates a new serialized page with custom options. @@ -533,6 +560,7 @@ impl SerializedPageReader { total_rows: usize, page_locations: Option>, props: ReaderPropertiesPtr, + crypto_context: Option<&'static CryptoContext>, ) -> Result { let decompressor = create_codec(meta.compression(), props.codec_options())?; let (start, len) = meta.byte_range(); @@ -560,12 +588,21 @@ impl SerializedPageReader { next_page_header: None, }, }; - + if crypto_context.is_some() { + return Ok(Self { + reader, + decompressor, + state, + physical_type: meta.column_type(), + crypto_context, + }) + } Ok(Self { reader, decompressor, state, physical_type: meta.column_type(), + crypto_context: None, }) } @@ -670,10 +707,26 @@ impl PageReader for SerializedPageReader { } let mut read = self.reader.get_read(*offset as u64)?; + // let mut prot = TCompactSliceInputProtocol::new(buffer.as_ref()); + + // let decrypted_fmd_buf = + // decryptor.decrypt(prot.as_slice().as_ref(), fmd_aad?.as_ref()); + + // if let Some(z) = self.crypto_context.as_ref() { + // let c = read.take(1); + // // read = z.get_data_decryptor().decrypt(&read, b"aaaaa"); + // // let (header_len, header) = read_page_header_len(&mut read)?; + // // header + // // let dec = z.get_data_decryptor().decrypt(header_len, header); + // } + // let file_decryptor = self.crypto_context.unwrap().get_data_decryptor().unwrap(); + let file_decryption_properties = + FileDecryptionProperties::builder().with_footer_key("0123456789012345".into()).build(); + // let file_decryptor = FileDecryptor::new(&file_decryption_properties); let header = if let Some(header) = next_page_header.take() { *header } else { - let (header_len, header) = read_page_header_len(&mut read)?; + let (header_len, header) = read_page_header_len(&mut read, self.crypto_context)?; verify_page_header_len(header_len, *remaining)?; *offset += header_len; *remaining -= header_len; @@ -766,7 +819,7 @@ impl PageReader for SerializedPageReader { } } else { let mut read = 
self.reader.get_read(*offset as u64)?; - let (header_len, header) = read_page_header_len(&mut read)?; + let (header_len, header) = read_page_header_len(&mut read, None)?; verify_page_header_len(header_len, *remaining_bytes)?; *offset += header_len; *remaining_bytes -= header_len; @@ -828,7 +881,7 @@ impl PageReader for SerializedPageReader { *remaining_bytes -= buffered_header.compressed_page_size as usize; } else { let mut read = self.reader.get_read(*offset as u64)?; - let (header_len, header) = read_page_header_len(&mut read)?; + let (header_len, header) = read_page_header_len(&mut read, None)?; verify_page_header_len(header_len, *remaining_bytes)?; verify_page_size( header.compressed_page_size, diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index 6b7707f03cd9..b509707da604 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -1314,6 +1314,7 @@ mod tests { total_num_values as usize, None, Arc::new(props), + None, ) .unwrap(); From be10eb340e0c18705878973320a35d6fe81cb585 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 17 Dec 2024 00:45:35 +0100 Subject: [PATCH 06/97] work --- parquet/src/arrow/arrow_reader/mod.rs | 22 ++++++- parquet/src/arrow/async_reader/mod.rs | 1 + parquet/src/encryption/ciphers.rs | 84 +++++++++++++++++++------- parquet/src/file/metadata/mod.rs | 7 +++ parquet/src/file/metadata/reader.rs | 16 +++-- parquet/src/file/serialized_reader.rs | 86 ++++++++++++--------------- 6 files changed, 137 insertions(+), 79 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 72be5f75c455..b3765a6b03d3 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -19,7 +19,7 @@ use std::collections::VecDeque; use std::sync::Arc; - +use num::ToPrimitive; use arrow_array::cast::AsArray; use arrow_array::Array; use arrow_array::{RecordBatch, RecordBatchReader}; @@ -42,7 +42,7 @@ mod filter; mod selection; pub mod 
statistics; -use crate::encryption::ciphers::FileDecryptionProperties; +use crate::encryption::ciphers::{CryptoContext, FileDecryptionProperties}; /// Builder for constructing parquet readers into arrow. /// @@ -695,7 +695,18 @@ impl Iterator for ReaderPageIterator { let total_rows = rg.num_rows() as usize; let reader = self.reader.clone(); - let ret = SerializedPageReader::new(reader, meta, total_rows, page_locations); + let file_decryptor = Arc::new(self.metadata.file_decryptor().clone().unwrap()); + // let aad_file_unique = file_decryptor?.aad_file_unique(); + // let aad_prefix = file_decryptor?.aad_prefix(); + // + // let file_decryptor = FileDecryptor::new(file_decryptor, aad_file_unique.clone(), aad_prefix.clone()); + + let crypto_context = CryptoContext::new( + meta.dictionary_page_offset().is_some(), rg_idx.to_i16()?, self.column_idx.to_i16()?, file_decryptor.clone(), file_decryptor); + let crypto_context = Arc::new(crypto_context); + + let ret = SerializedPageReader::new(reader, meta, total_rows, page_locations, Some(crypto_context)); + // let ret = SerializedPageReader::new(reader, meta, total_rows, page_locations); Some(ret.map(|x| Box::new(x) as _)) } } @@ -1853,6 +1864,11 @@ mod tests { }); // todo: decrypting data + let decryption_properties = Some( + ciphers::FileDecryptionProperties::builder() + .with_footer_key(key_code.to_vec()) + .build(), + ); let record_reader = ParquetRecordBatchReader::try_new_with_decryption(file, 128, decryption_properties.as_ref()) .unwrap(); diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index d7a6ef698e15..e35342cd32e6 100644 --- a/parquet/src/arrow/async_reader/mod.rs +++ b/parquet/src/arrow/async_reader/mod.rs @@ -972,6 +972,7 @@ impl RowGroups for InMemoryRowGroup<'_> { self.metadata.column(i), self.row_count, page_locations, + None, )?); Ok(Box::new(ColumnChunkIterator { diff --git a/parquet/src/encryption/ciphers.rs b/parquet/src/encryption/ciphers.rs index 
b4b7f47df5b0..89515fe0e006 100644 --- a/parquet/src/encryption/ciphers.rs +++ b/parquet/src/encryption/ciphers.rs @@ -18,6 +18,7 @@ //! Encryption implementation specific to Parquet, as described //! in the [spec](https://github.com/apache/parquet-format/blob/master/Encryption.md). +use std::sync::Arc; use ring::aead::{Aad, LessSafeKey, NonceSequence, UnboundKey, AES_128_GCM}; use ring::rand::{SecureRandom, SystemRandom}; use crate::errors::{ParquetError, Result}; @@ -172,8 +173,12 @@ pub fn create_footer_aad(file_aad: &[u8]) -> Result> { create_module_aad(file_aad, ModuleType::Footer, -1, -1, -1) } -fn create_module_aad(file_aad: &[u8], module_type: ModuleType, row_group_ordinal: i32, - column_ordinal: i32, page_ordinal: i32) -> Result> { +pub fn create_page_aad(file_aad: &[u8], module_type: ModuleType, row_group_ordinal: i16, column_ordinal: i16, page_ordinal: i32) -> Result> { + create_module_aad(file_aad, module_type, row_group_ordinal, column_ordinal, page_ordinal) +} + +fn create_module_aad(file_aad: &[u8], module_type: ModuleType, row_group_ordinal: i16, + column_ordinal: i16, page_ordinal: i32) -> Result> { let module_buf = [module_type as u8]; @@ -187,7 +192,7 @@ fn create_module_aad(file_aad: &[u8], module_type: ModuleType, row_group_ordinal if row_group_ordinal < 0 { return Err(general_err!("Wrong row group ordinal: {}", row_group_ordinal)); } - if row_group_ordinal > u16::MAX as i32 { + if row_group_ordinal > i16::MAX { return Err(general_err!("Encrypted parquet files can't have more than {} row groups: {}", u16::MAX, row_group_ordinal)); } @@ -195,7 +200,7 @@ fn create_module_aad(file_aad: &[u8], module_type: ModuleType, row_group_ordinal if column_ordinal < 0 { return Err(general_err!("Wrong column ordinal: {}", column_ordinal)); } - if column_ordinal > u16::MAX as i32 { + if column_ordinal > i16::MAX { return Err(general_err!("Encrypted parquet files can't have more than {} columns: {}", u16::MAX, column_ordinal)); } @@ -205,15 +210,15 @@ fn 
create_module_aad(file_aad: &[u8], module_type: ModuleType, row_group_ordinal let mut aad = Vec::with_capacity(file_aad.len() + 5); aad.extend_from_slice(file_aad); aad.extend_from_slice(module_buf.as_ref()); - aad.extend_from_slice((row_group_ordinal as u16).to_le_bytes().as_ref()); - aad.extend_from_slice((column_ordinal as u16).to_le_bytes().as_ref()); + aad.extend_from_slice((row_group_ordinal as i16).to_le_bytes().as_ref()); + aad.extend_from_slice((column_ordinal as i16).to_le_bytes().as_ref()); return Ok(aad) } if page_ordinal < 0 { - return Err(general_err!("Wrong column ordinal: {}", page_ordinal)); + return Err(general_err!("Wrong page ordinal: {}", page_ordinal)); } - if page_ordinal > u16::MAX as i32 { + if page_ordinal > i32::MAX { return Err(general_err!("Encrypted parquet files can't have more than {} pages in a chunk: {}", u16::MAX, page_ordinal)); } @@ -221,9 +226,9 @@ fn create_module_aad(file_aad: &[u8], module_type: ModuleType, row_group_ordinal let mut aad = Vec::with_capacity(file_aad.len() + 7); aad.extend_from_slice(file_aad); aad.extend_from_slice(module_buf.as_ref()); - aad.extend_from_slice((row_group_ordinal as u16).to_le_bytes().as_ref()); - aad.extend_from_slice((column_ordinal as u16).to_le_bytes().as_ref()); - aad.extend_from_slice((page_ordinal as u16).to_le_bytes().as_ref()); + aad.extend_from_slice(row_group_ordinal.to_le_bytes().as_ref()); + aad.extend_from_slice(column_ordinal.to_le_bytes().as_ref()); + aad.extend_from_slice(page_ordinal.to_le_bytes().as_ref()); Ok(aad) } @@ -266,7 +271,9 @@ impl DecryptionPropertiesBuilder { pub struct FileDecryptor { decryption_properties: FileDecryptionProperties, // todo decr: change to BlockDecryptor - footer_decryptor: RingGcmBlockDecryptor + footer_decryptor: RingGcmBlockDecryptor, + aad_file_unique: Vec, + aad_prefix: Vec, } impl PartialEq for FileDecryptor { @@ -276,11 +283,13 @@ impl PartialEq for FileDecryptor { } impl FileDecryptor { - pub(crate) fn new(decryption_properties: 
&FileDecryptionProperties) -> Self { + pub(crate) fn new(decryption_properties: &FileDecryptionProperties, aad_file_unique: Vec, aad_prefix: Vec) -> Self { Self { // todo decr: if no key available yet (not set in properties, will be retrieved from metadata) footer_decryptor: RingGcmBlockDecryptor::new(decryption_properties.footer_key.clone().unwrap().as_ref()), - decryption_properties: decryption_properties.clone() + decryption_properties: decryption_properties.clone(), + aad_file_unique, + aad_prefix, } } @@ -288,18 +297,49 @@ impl FileDecryptor { pub(crate) fn get_footer_decryptor(self) -> RingGcmBlockDecryptor { self.footer_decryptor } + + pub(crate) fn decryption_properties(&self) -> &FileDecryptionProperties { + &self.decryption_properties + } + + pub(crate) fn footer_decryptor(&self) -> RingGcmBlockDecryptor { + self.footer_decryptor.clone() + } + + pub(crate) fn aad_file_unique(&self) -> &Vec { + &self.aad_file_unique + } + + pub(crate) fn aad_prefix(&self) -> &Vec { + &self.aad_prefix + } } +#[derive(Debug, Clone)] pub struct CryptoContext { - row_group_ordinal: i32, - column_ordinal: i32, - metadata_decryptor: FileDecryptor, - data_decryptor: FileDecryptor, - file_decryption_properties: FileDecryptionProperties, - aad: Vec, + pub(crate) start_decrypt_with_dictionary_page: bool, + pub(crate) row_group_ordinal: i16, + pub(crate) column_ordinal: i16, + pub(crate) data_decryptor: Arc, + pub(crate) metadata_decryptor: Arc, + } impl CryptoContext { - pub fn data_decryptor(self) -> FileDecryptor { self.data_decryptor } - pub fn file_decryption_properties(&self) -> &FileDecryptionProperties { &self.file_decryption_properties } + pub fn new(start_decrypt_with_dictionary_page: bool, row_group_ordinal: i16, + column_ordinal: i16, data_decryptor: Arc, + metadata_decryptor: Arc) -> Self { + Self { + start_decrypt_with_dictionary_page, + row_group_ordinal, + column_ordinal, + data_decryptor, + metadata_decryptor, + } + } + pub fn 
start_decrypt_with_dictionary_page(&self) -> &bool { &self.start_decrypt_with_dictionary_page } + pub fn row_group_ordinal(&self) -> &i16 { &self.row_group_ordinal } + pub fn column_ordinal(&self) -> &i16 { &self.column_ordinal } + pub fn data_decryptor(&self) -> Arc { self.data_decryptor.clone()} + pub fn metadata_decryptor(&self) -> Arc { self.metadata_decryptor.clone() } } diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index f5a3c7e599be..1bbbecc7f54a 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -218,6 +218,13 @@ impl ParquetMetaData { &self.file_metadata } + /// Returns file decryptor as reference. + pub fn file_decryptor(&self) -> &Option { + &self.file_decryptor + } + + + /// Returns number of row groups in this file. pub fn num_row_groups(&self) -> usize { self.row_groups.len() diff --git a/parquet/src/file/metadata/reader.rs b/parquet/src/file/metadata/reader.rs index 39325f2c440d..494e161510ce 100644 --- a/parquet/src/file/metadata/reader.rs +++ b/parquet/src/file/metadata/reader.rs @@ -671,13 +671,19 @@ impl ParquetMetaDataReader { // todo decr: get key_metadata // remaining buffer contains encrypted FileMetaData - file_decryptor = Some(FileDecryptor::new(file_decryption_properties)); - let decryptor = file_decryptor.clone().unwrap().get_footer_decryptor(); + // todo decr: get aad_prefix // todo decr: set both aad_prefix and aad_file_unique in file_decryptor - let fmd_aad = create_footer_aad(aes_gcm_algo.aad_file_unique.unwrap().as_ref()); + let aad_file_unique = aes_gcm_algo.aad_file_unique.unwrap(); + let aad_footer = create_footer_aad(aad_file_unique.as_ref())?; + let aad_prefix : Vec = aes_gcm_algo.aad_prefix.unwrap_or_default(); + + file_decryptor = Some(FileDecryptor::new(file_decryption_properties, aad_file_unique.clone(), aad_prefix.clone())); + let decryptor = file_decryptor.clone().unwrap().get_footer_decryptor(); + // file_decryptor = 
Some(FileDecryptor::new(file_decryption_properties, aad, aad_prefix)); + decrypted_fmd_buf = - decryptor.decrypt(prot.as_slice().as_ref(), fmd_aad?.as_ref()); + decryptor.decrypt(prot.as_slice().as_ref(), aad_footer.as_ref()); prot = TCompactSliceInputProtocol::new(decrypted_fmd_buf.as_ref()); } @@ -707,7 +713,7 @@ impl ParquetMetaDataReader { schema_descr, column_orders, ); - Ok(ParquetMetaData::new(file_metadata, row_groups, file_decryptor)) + Ok(ParquetMetaData::new(file_metadata, row_groups, Some(file_decryptor.unwrap()))) } /// Parses column orders from Thrift definition. diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index a5f0c57750b8..e8165e2c5a14 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -38,9 +38,11 @@ use crate::record::reader::RowIter; use crate::record::Row; use crate::schema::types::Type as SchemaType; use crate::thrift::{TCompactSliceInputProtocol, TSerializable}; -use bytes::{Buf, Bytes}; -use thrift::protocol::TCompactInputProtocol; -use crate::encryption::ciphers::{create_footer_aad, BlockDecryptor, CryptoContext, FileDecryptionProperties, FileDecryptor, RingGcmBlockDecryptor}; +use bytes::Bytes; +use thrift::protocol::{TCompactInputProtocol, TInputProtocol}; +use zstd::zstd_safe::WriteBuf; +use crate::data_type::AsBytes; +use crate::encryption::ciphers::{create_page_aad, BlockDecryptor, CryptoContext, FileDecryptionProperties, ModuleType}; impl TryFrom for SerializedFileReader { type Error = ParquetError; @@ -339,37 +341,38 @@ impl RowGroupReader for SerializedRowGroupReader<'_, R } /// Reads a [`PageHeader`] from the provided [`Read`] -pub(crate) fn read_page_header(input: &mut T, crypto_context: Option<&CryptoContext>) -> Result { - let buf = &mut []; - let size = input.read(buf)?; - - // todo: decrypt buffer - let mut prot = TCompactSliceInputProtocol::new(buf.as_slice()); - let t_file_crypto_metadata: TFileCryptoMetaData = - 
TFileCryptoMetaData::read_from_in_protocol(&mut prot) - .map_err(|e| general_err!("Could not parse crypto metadata: {}", e))?; - - let file_decryption_properties = crypto_context.unwrap().file_decryption_properties(); - let file_decryptor = FileDecryptor::new(file_decryption_properties); - - // let fmd_aad = create_footer_aad(aes_gcm_algo.aad_file_unique.unwrap().as_ref()); - let algo = t_file_crypto_metadata.encryption_algorithm; - let aes_gcm_algo = if let EncryptionAlgorithm::AESGCMV1(a) = algo { - a - } else { - unreachable!() - }; // todo decr: add support for GCMCTRV1 - let fmd_aad = create_footer_aad(aes_gcm_algo.aad_file_unique.unwrap().as_ref()); - let buf2 = file_decryptor.get_footer_decryptor().decrypt(prot.as_slice().as_ref(), fmd_aad?.as_ref()); - - let mut prot = TCompactInputProtocol::new(buf2.reader()); - +pub(crate) fn read_page_header(input: &mut T, crypto_context: Option>) -> Result { + let mut prot = TCompactInputProtocol::new(input); + if let Some(crypto_context) = crypto_context { + // let mut buf = [0; 16 * 1024]; + // let size = input.read(&mut buf)?; + + let decryptor = &crypto_context.data_decryptor(); + let file_decryptor = decryptor.footer_decryptor(); + let aad_file_unique = decryptor.aad_file_unique(); + // let aad_prefix = decryptor.aad_prefix(); + + let aad = create_page_aad( + aad_file_unique.as_slice(), + ModuleType::DictionaryPageHeader, + crypto_context.row_group_ordinal, + crypto_context.column_ordinal, + 0, + )?; + + // todo: This currently fails, possibly due to wrongly generated AAD + let buf = file_decryptor.decrypt(prot.read_bytes()?.as_slice(), aad.as_ref()); + todo!("Decrypted page header!"); + let mut prot = TCompactSliceInputProtocol::new(buf.as_slice()); + let page_header = PageHeader::read_from_in_protocol(&mut prot)?; + return Ok(page_header) + } let page_header = PageHeader::read_from_in_protocol(&mut prot)?; Ok(page_header) } /// Reads a [`PageHeader`] from the provided [`Read`] returning the number of bytes read 
-fn read_page_header_len(input: &mut T, crypto_context: Option<&CryptoContext>) -> Result<(usize, PageHeader)> { +fn read_page_header_len(input: &mut T, crypto_context: Option>) -> Result<(usize, PageHeader)> { /// A wrapper around a [`std::io::Read`] that keeps track of the bytes read struct TrackedRead { inner: R, @@ -538,7 +541,7 @@ pub struct SerializedPageReader { state: SerializedPageReaderState, /// Crypto context - crypto_context: Option<&'static CryptoContext>, + crypto_context: Option>, } impl SerializedPageReader { @@ -548,9 +551,10 @@ impl SerializedPageReader { meta: &ColumnChunkMetaData, total_rows: usize, page_locations: Option>, + crypto_context: Option>, ) -> Result { let props = Arc::new(ReaderProperties::builder().build()); - SerializedPageReader::new_with_properties(reader, meta, total_rows, page_locations, props, None) + SerializedPageReader::new_with_properties(reader, meta, total_rows, page_locations, props, crypto_context) } /// Creates a new serialized page with custom options. 
@@ -560,7 +564,7 @@ impl SerializedPageReader { total_rows: usize, page_locations: Option>, props: ReaderPropertiesPtr, - crypto_context: Option<&'static CryptoContext>, + crypto_context: Option>, ) -> Result { let decompressor = create_codec(meta.compression(), props.codec_options())?; let (start, len) = meta.byte_range(); @@ -707,26 +711,10 @@ impl PageReader for SerializedPageReader { } let mut read = self.reader.get_read(*offset as u64)?; - // let mut prot = TCompactSliceInputProtocol::new(buffer.as_ref()); - - // let decrypted_fmd_buf = - // decryptor.decrypt(prot.as_slice().as_ref(), fmd_aad?.as_ref()); - - // if let Some(z) = self.crypto_context.as_ref() { - // let c = read.take(1); - // // read = z.get_data_decryptor().decrypt(&read, b"aaaaa"); - // // let (header_len, header) = read_page_header_len(&mut read)?; - // // header - // // let dec = z.get_data_decryptor().decrypt(header_len, header); - // } - // let file_decryptor = self.crypto_context.unwrap().get_data_decryptor().unwrap(); - let file_decryption_properties = - FileDecryptionProperties::builder().with_footer_key("0123456789012345".into()).build(); - // let file_decryptor = FileDecryptor::new(&file_decryption_properties); let header = if let Some(header) = next_page_header.take() { *header } else { - let (header_len, header) = read_page_header_len(&mut read, self.crypto_context)?; + let (header_len, header) = read_page_header_len(&mut read, self.crypto_context.clone())?; verify_page_header_len(header_len, *remaining)?; *offset += header_len; *remaining -= header_len; From 46910b2db2dc3c8778dcdb3d4944bc907886675a Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 17 Dec 2024 13:30:47 +0100 Subject: [PATCH 07/97] Review feedback --- parquet/src/arrow/arrow_reader/mod.rs | 6 +---- parquet/src/encryption/ciphers.rs | 17 ++++++------ parquet/src/file/serialized_reader.rs | 39 ++++++++++++++++++++------- 3 files changed, 40 insertions(+), 22 deletions(-) diff --git 
a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index b3765a6b03d3..e7baf88bd2c2 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -696,13 +696,9 @@ impl Iterator for ReaderPageIterator { let reader = self.reader.clone(); let file_decryptor = Arc::new(self.metadata.file_decryptor().clone().unwrap()); - // let aad_file_unique = file_decryptor?.aad_file_unique(); - // let aad_prefix = file_decryptor?.aad_prefix(); - // - // let file_decryptor = FileDecryptor::new(file_decryptor, aad_file_unique.clone(), aad_prefix.clone()); let crypto_context = CryptoContext::new( - meta.dictionary_page_offset().is_some(), rg_idx.to_i16()?, self.column_idx.to_i16()?, file_decryptor.clone(), file_decryptor); + meta.dictionary_page_offset().is_some(), rg_idx as i16, self.column_idx as i16, file_decryptor.clone(), file_decryptor); let crypto_context = Arc::new(crypto_context); let ret = SerializedPageReader::new(reader, meta, total_rows, page_locations, Some(crypto_context)); diff --git a/parquet/src/encryption/ciphers.rs b/parquet/src/encryption/ciphers.rs index 89515fe0e006..b347c292692f 100644 --- a/parquet/src/encryption/ciphers.rs +++ b/parquet/src/encryption/ciphers.rs @@ -177,8 +177,8 @@ pub fn create_page_aad(file_aad: &[u8], module_type: ModuleType, row_group_ordin create_module_aad(file_aad, module_type, row_group_ordinal, column_ordinal, page_ordinal) } -fn create_module_aad(file_aad: &[u8], module_type: ModuleType, row_group_ordinal: i16, - column_ordinal: i16, page_ordinal: i32) -> Result> { +pub fn create_module_aad(file_aad: &[u8], module_type: ModuleType, row_group_ordinal: i16, + column_ordinal: i16, page_ordinal: i32) -> Result> { let module_buf = [module_type as u8]; @@ -192,17 +192,18 @@ fn create_module_aad(file_aad: &[u8], module_type: ModuleType, row_group_ordinal if row_group_ordinal < 0 { return Err(general_err!("Wrong row group ordinal: {}", row_group_ordinal)); } + // todo: this 
check is a noop here if row_group_ordinal > i16::MAX { return Err(general_err!("Encrypted parquet files can't have more than {} row groups: {}", - u16::MAX, row_group_ordinal)); + i16::MAX, row_group_ordinal)); } - if column_ordinal < 0 { return Err(general_err!("Wrong column ordinal: {}", column_ordinal)); } + // todo: this check is a noop here if column_ordinal > i16::MAX { return Err(general_err!("Encrypted parquet files can't have more than {} columns: {}", - u16::MAX, column_ordinal)); + i16::MAX, column_ordinal)); } if module_buf[0] != (ModuleType::DataPageHeader as u8) && @@ -218,9 +219,9 @@ fn create_module_aad(file_aad: &[u8], module_type: ModuleType, row_group_ordinal if page_ordinal < 0 { return Err(general_err!("Wrong page ordinal: {}", page_ordinal)); } - if page_ordinal > i32::MAX { + if page_ordinal > i16::MAX as i32 { return Err(general_err!("Encrypted parquet files can't have more than {} pages in a chunk: {}", - u16::MAX, page_ordinal)); + i16::MAX, page_ordinal)); } let mut aad = Vec::with_capacity(file_aad.len() + 7); @@ -228,7 +229,7 @@ fn create_module_aad(file_aad: &[u8], module_type: ModuleType, row_group_ordinal aad.extend_from_slice(module_buf.as_ref()); aad.extend_from_slice(row_group_ordinal.to_le_bytes().as_ref()); aad.extend_from_slice(column_ordinal.to_le_bytes().as_ref()); - aad.extend_from_slice(page_ordinal.to_le_bytes().as_ref()); + aad.extend_from_slice((page_ordinal as i16).to_le_bytes().as_ref()); Ok(aad) } diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index e8165e2c5a14..db94ebb23f24 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -342,31 +342,33 @@ impl RowGroupReader for SerializedRowGroupReader<'_, R /// Reads a [`PageHeader`] from the provided [`Read`] pub(crate) fn read_page_header(input: &mut T, crypto_context: Option>) -> Result { - let mut prot = TCompactInputProtocol::new(input); if let Some(crypto_context) = crypto_context 
{ - // let mut buf = [0; 16 * 1024]; - // let size = input.read(&mut buf)?; - let decryptor = &crypto_context.data_decryptor(); let file_decryptor = decryptor.footer_decryptor(); let aad_file_unique = decryptor.aad_file_unique(); - // let aad_prefix = decryptor.aad_prefix(); + // todo: page ordinal and page type (ModuleType) let aad = create_page_aad( aad_file_unique.as_slice(), - ModuleType::DictionaryPageHeader, + ModuleType::DataPageHeader, crypto_context.row_group_ordinal, crypto_context.column_ordinal, 0, )?; - // todo: This currently fails, possibly due to wrongly generated AAD - let buf = file_decryptor.decrypt(prot.read_bytes()?.as_slice(), aad.as_ref()); - todo!("Decrypted page header!"); + let mut len_bytes = [0; 4]; + input.read_exact(&mut len_bytes)?; + let ciphertext_len = u32::from_le_bytes(len_bytes) as usize; + let mut ciphertext = vec![0; 4 + ciphertext_len]; + input.read_exact(&mut ciphertext[4..])?; + let buf = file_decryptor.decrypt(&ciphertext, aad.as_ref()); + let mut prot = TCompactSliceInputProtocol::new(buf.as_slice()); let page_header = PageHeader::read_from_in_protocol(&mut prot)?; return Ok(page_header) } + + let mut prot = TCompactInputProtocol::new(input); let page_header = PageHeader::read_from_in_protocol(&mut prot)?; Ok(page_header) } @@ -401,6 +403,7 @@ pub(crate) fn decode_page( buffer: Bytes, physical_type: Type, decompressor: Option<&mut Box>, + crypto_context: Option>, ) -> Result { // Verify the 32-bit CRC checksum of the page #[cfg(feature = "crc")] @@ -426,6 +429,22 @@ pub(crate) fn decode_page( // When is_compressed flag is missing the page is considered compressed can_decompress = header_v2.is_compressed.unwrap_or(true); } + if crypto_context.is_some() { + let crypto_context = crypto_context.as_ref().unwrap(); + let decryptor = crypto_context.data_decryptor(); + let file_decryptor = decryptor.footer_decryptor(); + + // todo: page ordinal + let aad = create_page_aad( + decryptor.aad_file_unique().as_slice(), + 
ModuleType::DataPage, + crypto_context.row_group_ordinal, + crypto_context.column_ordinal, + 0, + )?; + let decrypted = file_decryptor.decrypt(&buffer.as_ref()[offset..], &aad); + todo!("page decrypted!"); + } // TODO: page header could be huge because of statistics. We should set a // maximum page header size and abort if that is exceeded. @@ -749,6 +768,7 @@ impl PageReader for SerializedPageReader { Bytes::from(buffer), self.physical_type, self.decompressor.as_mut(), + self.crypto_context.clone(), )? } SerializedPageReaderState::Pages { @@ -778,6 +798,7 @@ impl PageReader for SerializedPageReader { bytes, self.physical_type, self.decompressor.as_mut(), + None, )? } }; From e8e5df2182ea0af46f8f460934672a5029d42b3e Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 17 Dec 2024 16:55:15 +0100 Subject: [PATCH 08/97] page decompression issue --- parquet/src/file/serialized_reader.rs | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index db94ebb23f24..f4980818e9a8 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -429,7 +429,8 @@ pub(crate) fn decode_page( // When is_compressed flag is missing the page is considered compressed can_decompress = header_v2.is_compressed.unwrap_or(true); } - if crypto_context.is_some() { + + let buffer = if crypto_context.is_some() { let crypto_context = crypto_context.as_ref().unwrap(); let decryptor = crypto_context.data_decryptor(); let file_decryptor = decryptor.footer_decryptor(); @@ -442,15 +443,18 @@ pub(crate) fn decode_page( crypto_context.column_ordinal, 0, )?; - let decrypted = file_decryptor.decrypt(&buffer.as_ref()[offset..], &aad); - todo!("page decrypted!"); - } + let decrypted = file_decryptor.decrypt(&buffer.as_ref(), &aad); + Bytes::from(decrypted) + } else { + buffer + }; // TODO: page header could be huge because of statistics. 
We should set a // maximum page header size and abort if that is exceeded. let buffer = match decompressor { Some(decompressor) if can_decompress => { let uncompressed_size = page_header.uncompressed_page_size as usize; + let mut decompressed = Vec::with_capacity(uncompressed_size); let compressed = &buffer.as_ref()[offset..]; decompressed.extend_from_slice(&buffer.as_ref()[..offset]); @@ -459,6 +463,7 @@ pub(crate) fn decode_page( &mut decompressed, Some(uncompressed_size - offset), )?; + todo!("page decompressed!"); if decompressed.len() != uncompressed_size { return Err(general_err!( From 8501cf9f72c6c785f9a189f5ede62e7432ab7bb9 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 18 Dec 2024 00:17:43 +0100 Subject: [PATCH 09/97] add update_aad --- parquet/src/encryption/ciphers.rs | 13 ++++++++++++- parquet/src/file/serialized_reader.rs | 3 +-- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/parquet/src/encryption/ciphers.rs b/parquet/src/encryption/ciphers.rs index b347c292692f..c0e7ce1a8037 100644 --- a/parquet/src/encryption/ciphers.rs +++ b/parquet/src/encryption/ciphers.rs @@ -35,6 +35,7 @@ const RIGHT_TWELVE: u128 = 0x0000_0000_ffff_ffff_ffff_ffff_ffff_ffff; const NONCE_LEN: usize = 12; const TAG_LEN: usize = 16; const SIZE_LEN: usize = 4; +const NON_PAGE_ORDINAL: i32 = -1; struct CounterNonce { start: u128, @@ -152,10 +153,12 @@ impl BlockDecryptor for RingGcmBlockDecryptor { .open_in_place(nonce, Aad::from(aad), &mut result) .unwrap(); + result.resize(result.len() - TAG_LEN, 0u8); result } } +#[derive(PartialEq)] pub(crate) enum ModuleType { Footer = 0, ColumnMetaData = 1, @@ -170,7 +173,7 @@ pub(crate) enum ModuleType { } pub fn create_footer_aad(file_aad: &[u8]) -> Result> { - create_module_aad(file_aad, ModuleType::Footer, -1, -1, -1) + create_module_aad(file_aad, ModuleType::Footer, -1, -1, NON_PAGE_ORDINAL) } pub fn create_page_aad(file_aad: &[u8], module_type: ModuleType, row_group_ordinal: i16, column_ordinal: i16, page_ordinal: 
i32) -> Result> { @@ -314,6 +317,14 @@ impl FileDecryptor { pub(crate) fn aad_prefix(&self) -> &Vec { &self.aad_prefix } + + pub fn update_aad(&mut self, aad: Vec, row_group_ordinal: i16, column_ordinal: i16, module_type: ModuleType) { + // todo decr: update aad + debug_assert!(!self.aad_file_unique().is_empty(), "AAD is empty"); + + let aad = create_module_aad(self.aad_file_unique(), module_type, row_group_ordinal, column_ordinal, NON_PAGE_ORDINAL).unwrap(); + self.aad_file_unique = aad; + } } #[derive(Debug, Clone)] diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index f4980818e9a8..3a938ae324ca 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -430,7 +430,7 @@ pub(crate) fn decode_page( can_decompress = header_v2.is_compressed.unwrap_or(true); } - let buffer = if crypto_context.is_some() { + let buffer : Bytes = if crypto_context.is_some() { let crypto_context = crypto_context.as_ref().unwrap(); let decryptor = crypto_context.data_decryptor(); let file_decryptor = decryptor.footer_decryptor(); @@ -463,7 +463,6 @@ pub(crate) fn decode_page( &mut decompressed, Some(uncompressed_size - offset), )?; - todo!("page decompressed!"); if decompressed.len() != uncompressed_size { return Err(general_err!( From 1a8fd94b98333b144544c632a2e2cbea3e12af3f Mon Sep 17 00:00:00 2001 From: Adam Reeve Date: Wed, 18 Dec 2024 11:11:34 +1300 Subject: [PATCH 10/97] Change encrypt and decrypt to return Results --- parquet/src/encryption/ciphers.rs | 24 +++++++++++------------- parquet/src/errors.rs | 7 +++++++ parquet/src/file/metadata/reader.rs | 2 +- parquet/src/file/serialized_reader.rs | 4 ++-- 4 files changed, 21 insertions(+), 16 deletions(-) diff --git a/parquet/src/encryption/ciphers.rs b/parquet/src/encryption/ciphers.rs index c0e7ce1a8037..5379357326b9 100644 --- a/parquet/src/encryption/ciphers.rs +++ b/parquet/src/encryption/ciphers.rs @@ -24,11 +24,11 @@ use ring::rand::{SecureRandom, 
SystemRandom}; use crate::errors::{ParquetError, Result}; pub trait BlockEncryptor { - fn encrypt(&mut self, plaintext: &[u8], aad: &[u8]) -> Vec; + fn encrypt(&mut self, plaintext: &[u8], aad: &[u8]) -> Result>; } pub trait BlockDecryptor { - fn decrypt(&self, length_and_ciphertext: &[u8], aad: &[u8]) -> Vec; + fn decrypt(&self, length_and_ciphertext: &[u8], aad: &[u8]) -> Result>; } const RIGHT_TWELVE: u128 = 0x0000_0000_ffff_ffff_ffff_ffff_ffff_ffff; @@ -102,8 +102,8 @@ impl RingGcmBlockEncryptor { } impl BlockEncryptor for RingGcmBlockEncryptor { - fn encrypt(&mut self, plaintext: &[u8], aad: &[u8]) -> Vec { - let nonce = self.nonce_sequence.advance().unwrap(); + fn encrypt(&mut self, plaintext: &[u8], aad: &[u8]) -> Result> { + let nonce = self.nonce_sequence.advance()?; let ciphertext_len = plaintext.len() + NONCE_LEN + TAG_LEN; // todo TBD: add first 4 bytes with the length, per https://github.com/apache/parquet-format/blob/master/Encryption.md#51-encrypted-module-serialization let mut result = Vec::with_capacity(SIZE_LEN + ciphertext_len); @@ -113,11 +113,10 @@ impl BlockEncryptor for RingGcmBlockEncryptor { let tag = self .key - .seal_in_place_separate_tag(nonce, Aad::from(aad), &mut result[SIZE_LEN + NONCE_LEN..]) - .unwrap(); + .seal_in_place_separate_tag(nonce, Aad::from(aad), &mut result[SIZE_LEN + NONCE_LEN..])?; result.extend_from_slice(tag.as_ref()); - result + Ok(result) } } @@ -138,7 +137,7 @@ impl RingGcmBlockDecryptor { } impl BlockDecryptor for RingGcmBlockDecryptor { - fn decrypt(&self, length_and_ciphertext: &[u8], aad: &[u8]) -> Vec { + fn decrypt(&self, length_and_ciphertext: &[u8], aad: &[u8]) -> Result> { let mut result = Vec::with_capacity( length_and_ciphertext.len() - SIZE_LEN - NONCE_LEN - TAG_LEN, ); @@ -146,15 +145,14 @@ impl BlockDecryptor for RingGcmBlockDecryptor { let nonce = ring::aead::Nonce::try_assume_unique_for_key( &length_and_ciphertext[SIZE_LEN..SIZE_LEN + NONCE_LEN], - ) - .unwrap(); + )?; self.key - 
.open_in_place(nonce, Aad::from(aad), &mut result) - .unwrap(); + .open_in_place(nonce, Aad::from(aad), &mut result)?; + // Truncate result to remove the tag result.resize(result.len() - TAG_LEN, 0u8); - result + Ok(result) } } diff --git a/parquet/src/errors.rs b/parquet/src/errors.rs index d749287bba62..72e5cedcc5a8 100644 --- a/parquet/src/errors.rs +++ b/parquet/src/errors.rs @@ -132,6 +132,13 @@ impl From for ParquetError { } } +//#[cfg(feature = "encryption")] +impl From for ParquetError { + fn from(e: ring::error::Unspecified) -> ParquetError { + ParquetError::External(Box::new(e)) + } +} + /// A specialized `Result` for Parquet errors. pub type Result = result::Result; diff --git a/parquet/src/file/metadata/reader.rs b/parquet/src/file/metadata/reader.rs index 494e161510ce..f32b5eb8bdd5 100644 --- a/parquet/src/file/metadata/reader.rs +++ b/parquet/src/file/metadata/reader.rs @@ -683,7 +683,7 @@ impl ParquetMetaDataReader { // file_decryptor = Some(FileDecryptor::new(file_decryption_properties, aad, aad_prefix)); decrypted_fmd_buf = - decryptor.decrypt(prot.as_slice().as_ref(), aad_footer.as_ref()); + decryptor.decrypt(prot.as_slice().as_ref(), aad_footer.as_ref())?; prot = TCompactSliceInputProtocol::new(decrypted_fmd_buf.as_ref()); } diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index 3a938ae324ca..7ccef8b23836 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -361,7 +361,7 @@ pub(crate) fn read_page_header(input: &mut T, crypto_context: Option Date: Wed, 18 Dec 2024 13:33:26 +1300 Subject: [PATCH 11/97] Use correct page ordinal and module type in AADs --- parquet/src/arrow/arrow_reader/mod.rs | 2 +- parquet/src/column/page.rs | 8 ++++ parquet/src/encryption/ciphers.rs | 58 ++++++++++++++++---------- parquet/src/file/serialized_reader.rs | 60 ++++++++++++++++++++++----- 4 files changed, 95 insertions(+), 33 deletions(-) diff --git 
a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index e7baf88bd2c2..99682c0d59c5 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -698,7 +698,7 @@ impl Iterator for ReaderPageIterator { let file_decryptor = Arc::new(self.metadata.file_decryptor().clone().unwrap()); let crypto_context = CryptoContext::new( - meta.dictionary_page_offset().is_some(), rg_idx as i16, self.column_idx as i16, file_decryptor.clone(), file_decryptor); + rg_idx as i16, self.column_idx as i16, file_decryptor.clone(), file_decryptor); let crypto_context = Arc::new(crypto_context); let ret = SerializedPageReader::new(reader, meta, total_rows, page_locations, Some(crypto_context)); diff --git a/parquet/src/column/page.rs b/parquet/src/column/page.rs index 5c866318e185..931241e4259f 100644 --- a/parquet/src/column/page.rs +++ b/parquet/src/column/page.rs @@ -90,6 +90,14 @@ impl Page { } } + pub fn is_data_page(&self) -> bool { + matches!(self, Page::DataPage { .. } | Page::DataPageV2 { .. }) + } + + pub fn is_dictionary_page(&self) -> bool { + matches!(self, Page::DictionaryPage { .. }) + } + /// Returns internal byte buffer reference for this page. 
pub fn buffer(&self) -> &Bytes { match self { diff --git a/parquet/src/encryption/ciphers.rs b/parquet/src/encryption/ciphers.rs index 5379357326b9..f795da547933 100644 --- a/parquet/src/encryption/ciphers.rs +++ b/parquet/src/encryption/ciphers.rs @@ -35,7 +35,6 @@ const RIGHT_TWELVE: u128 = 0x0000_0000_ffff_ffff_ffff_ffff_ffff_ffff; const NONCE_LEN: usize = 12; const TAG_LEN: usize = 16; const SIZE_LEN: usize = 4; -const NON_PAGE_ORDINAL: i32 = -1; struct CounterNonce { start: u128, @@ -171,15 +170,15 @@ pub(crate) enum ModuleType { } pub fn create_footer_aad(file_aad: &[u8]) -> Result> { - create_module_aad(file_aad, ModuleType::Footer, -1, -1, NON_PAGE_ORDINAL) + create_module_aad(file_aad, ModuleType::Footer, -1, -1, None) } -pub fn create_page_aad(file_aad: &[u8], module_type: ModuleType, row_group_ordinal: i16, column_ordinal: i16, page_ordinal: i32) -> Result> { +pub fn create_page_aad(file_aad: &[u8], module_type: ModuleType, row_group_ordinal: i16, column_ordinal: i16, page_ordinal: Option) -> Result> { create_module_aad(file_aad, module_type, row_group_ordinal, column_ordinal, page_ordinal) } pub fn create_module_aad(file_aad: &[u8], module_type: ModuleType, row_group_ordinal: i16, - column_ordinal: i16, page_ordinal: i32) -> Result> { + column_ordinal: i16, page_ordinal: Option) -> Result> { let module_buf = [module_type as u8]; @@ -217,20 +216,19 @@ pub fn create_module_aad(file_aad: &[u8], module_type: ModuleType, row_group_ord return Ok(aad) } + let page_ordinal = page_ordinal.ok_or_else(|| general_err!( + "Page ordinal must be set for data pages"))?; + if page_ordinal < 0 { return Err(general_err!("Wrong page ordinal: {}", page_ordinal)); } - if page_ordinal > i16::MAX as i32 { - return Err(general_err!("Encrypted parquet files can't have more than {} pages in a chunk: {}", - i16::MAX, page_ordinal)); - } let mut aad = Vec::with_capacity(file_aad.len() + 7); aad.extend_from_slice(file_aad); aad.extend_from_slice(module_buf.as_ref()); 
aad.extend_from_slice(row_group_ordinal.to_le_bytes().as_ref()); aad.extend_from_slice(column_ordinal.to_le_bytes().as_ref()); - aad.extend_from_slice((page_ordinal as i16).to_le_bytes().as_ref()); + aad.extend_from_slice(page_ordinal.to_le_bytes().as_ref()); Ok(aad) } @@ -315,41 +313,57 @@ impl FileDecryptor { pub(crate) fn aad_prefix(&self) -> &Vec { &self.aad_prefix } - - pub fn update_aad(&mut self, aad: Vec, row_group_ordinal: i16, column_ordinal: i16, module_type: ModuleType) { - // todo decr: update aad - debug_assert!(!self.aad_file_unique().is_empty(), "AAD is empty"); - - let aad = create_module_aad(self.aad_file_unique(), module_type, row_group_ordinal, column_ordinal, NON_PAGE_ORDINAL).unwrap(); - self.aad_file_unique = aad; - } } #[derive(Debug, Clone)] pub struct CryptoContext { - pub(crate) start_decrypt_with_dictionary_page: bool, pub(crate) row_group_ordinal: i16, pub(crate) column_ordinal: i16, + pub(crate) page_ordinal: Option, + pub(crate) dictionary_page: bool, pub(crate) data_decryptor: Arc, pub(crate) metadata_decryptor: Arc, - } impl CryptoContext { - pub fn new(start_decrypt_with_dictionary_page: bool, row_group_ordinal: i16, + pub fn new(row_group_ordinal: i16, column_ordinal: i16, data_decryptor: Arc, metadata_decryptor: Arc) -> Self { Self { - start_decrypt_with_dictionary_page, row_group_ordinal, column_ordinal, + page_ordinal: None, + dictionary_page: false, data_decryptor, metadata_decryptor, } } - pub fn start_decrypt_with_dictionary_page(&self) -> &bool { &self.start_decrypt_with_dictionary_page } + + pub fn with_page_ordinal(&self, page_ordinal: i16) -> Self { + Self { + row_group_ordinal: self.row_group_ordinal, + column_ordinal: self.column_ordinal, + page_ordinal: Some(page_ordinal), + dictionary_page: false, + data_decryptor: self.data_decryptor.clone(), + metadata_decryptor: self.metadata_decryptor.clone(), + } + } + + pub fn for_dictionary_page(&self) -> Self { + Self { + row_group_ordinal: self.row_group_ordinal, + 
column_ordinal: self.column_ordinal, + page_ordinal: self.page_ordinal, + dictionary_page: true, + data_decryptor: self.data_decryptor.clone(), + metadata_decryptor: self.metadata_decryptor.clone(), + } + } + pub fn row_group_ordinal(&self) -> &i16 { &self.row_group_ordinal } pub fn column_ordinal(&self) -> &i16 { &self.column_ordinal } + pub fn page_ordinal(&self) -> &Option { &self.page_ordinal } pub fn data_decryptor(&self) -> Arc { self.data_decryptor.clone()} pub fn metadata_decryptor(&self) -> Arc { self.metadata_decryptor.clone() } } diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index 7ccef8b23836..8fbdeb0ae882 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -39,6 +39,7 @@ use crate::record::Row; use crate::schema::types::Type as SchemaType; use crate::thrift::{TCompactSliceInputProtocol, TSerializable}; use bytes::Bytes; +use num::ToPrimitive; use thrift::protocol::{TCompactInputProtocol, TInputProtocol}; use zstd::zstd_safe::WriteBuf; use crate::data_type::AsBytes; @@ -347,13 +348,17 @@ pub(crate) fn read_page_header(input: &mut T, crypto_context: Option>, + + /// The index of the data page within this column chunk + page_ordinal: usize, + + /// Whether the next page is expected to be a dictionary page + require_dictionary: bool, }, Pages { /// Remaining page locations @@ -613,6 +628,8 @@ impl SerializedPageReader { offset: start as usize, remaining_bytes: len as usize, next_page_header: None, + page_ordinal: 0, + require_dictionary: meta.dictionary_page_offset().is_some(), }, }; if crypto_context.is_some() { @@ -728,6 +745,8 @@ impl PageReader for SerializedPageReader { offset, remaining_bytes: remaining, next_page_header, + page_ordinal, + require_dictionary, } => { if *remaining == 0 { return Ok(None); @@ -737,7 +756,8 @@ impl PageReader for SerializedPageReader { let header = if let Some(header) = next_page_header.take() { *header } else { - let (header_len, 
header) = read_page_header_len(&mut read, self.crypto_context.clone())?; + let crypto_context = page_crypto_context(&self.crypto_context, *page_ordinal, *require_dictionary)?; + let (header_len, header) = read_page_header_len(&mut read, crypto_context)?; verify_page_header_len(header_len, *remaining)?; *offset += header_len; *remaining -= header_len; @@ -767,13 +787,20 @@ impl PageReader for SerializedPageReader { )); } - decode_page( + let crypto_context = page_crypto_context(&self.crypto_context, *page_ordinal, *require_dictionary)?; + let page = decode_page( header, Bytes::from(buffer), self.physical_type, self.decompressor.as_mut(), - self.crypto_context.clone(), - )? + crypto_context, + )?; + if page.is_data_page() { + *page_ordinal += 1; + } else if page.is_dictionary_page() { + *require_dictionary = false; + } + page } SerializedPageReaderState::Pages { page_locations, @@ -817,6 +844,7 @@ impl PageReader for SerializedPageReader { offset, remaining_bytes, next_page_header, + .. } => { loop { if *remaining_bytes == 0 { @@ -882,6 +910,7 @@ impl PageReader for SerializedPageReader { offset, remaining_bytes, next_page_header, + .. 
} => { if let Some(buffered_header) = next_page_header.take() { verify_page_size( @@ -923,6 +952,17 @@ impl PageReader for SerializedPageReader { } } +fn page_crypto_context(crypto_context: &Option>, page_ordinal: usize, dictionary_page: bool) -> Result>> { + let page_ordinal = page_ordinal + .to_i16() + .ok_or_else(|| general_err!( + "Page ordinal {} is greater than the maximum allowed in encrypted Parquet files ({})", + page_ordinal, i16::MAX))?; + + Ok(crypto_context.as_ref().map( + |c| Arc::new(if dictionary_page { c.for_dictionary_page() } else { c.with_page_ordinal(page_ordinal) }))) +} + #[cfg(test)] mod tests { use std::collections::HashSet; From 968209f5a837bd4fbc9c0016e4813c8aec091d96 Mon Sep 17 00:00:00 2001 From: Adam Reeve Date: Wed, 18 Dec 2024 14:36:38 +1300 Subject: [PATCH 12/97] Tidy up ordinal types --- parquet/src/arrow/arrow_reader/mod.rs | 2 +- parquet/src/encryption/ciphers.rs | 46 +++++++++++---------------- parquet/src/file/serialized_reader.rs | 21 +++++------- 3 files changed, 27 insertions(+), 42 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 99682c0d59c5..a558512d8212 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -698,7 +698,7 @@ impl Iterator for ReaderPageIterator { let file_decryptor = Arc::new(self.metadata.file_decryptor().clone().unwrap()); let crypto_context = CryptoContext::new( - rg_idx as i16, self.column_idx as i16, file_decryptor.clone(), file_decryptor); + rg_idx, self.column_idx, file_decryptor.clone(), file_decryptor); let crypto_context = Arc::new(crypto_context); let ret = SerializedPageReader::new(reader, meta, total_rows, page_locations, Some(crypto_context)); diff --git a/parquet/src/encryption/ciphers.rs b/parquet/src/encryption/ciphers.rs index f795da547933..6ecbb003dd5b 100644 --- a/parquet/src/encryption/ciphers.rs +++ b/parquet/src/encryption/ciphers.rs @@ -170,15 +170,15 @@ pub(crate) enum 
ModuleType { } pub fn create_footer_aad(file_aad: &[u8]) -> Result> { - create_module_aad(file_aad, ModuleType::Footer, -1, -1, None) + create_module_aad(file_aad, ModuleType::Footer, 0, 0, None) } -pub fn create_page_aad(file_aad: &[u8], module_type: ModuleType, row_group_ordinal: i16, column_ordinal: i16, page_ordinal: Option) -> Result> { +pub fn create_page_aad(file_aad: &[u8], module_type: ModuleType, row_group_ordinal: usize, column_ordinal: usize, page_ordinal: Option) -> Result> { create_module_aad(file_aad, module_type, row_group_ordinal, column_ordinal, page_ordinal) } -pub fn create_module_aad(file_aad: &[u8], module_type: ModuleType, row_group_ordinal: i16, - column_ordinal: i16, page_ordinal: Option) -> Result> { +fn create_module_aad(file_aad: &[u8], module_type: ModuleType, row_group_ordinal: usize, + column_ordinal: usize, page_ordinal: Option) -> Result> { let module_buf = [module_type as u8]; @@ -189,19 +189,11 @@ pub fn create_module_aad(file_aad: &[u8], module_type: ModuleType, row_group_ord return Ok(aad) } - if row_group_ordinal < 0 { - return Err(general_err!("Wrong row group ordinal: {}", row_group_ordinal)); - } - // todo: this check is a noop here - if row_group_ordinal > i16::MAX { + if row_group_ordinal > i16::MAX as usize { return Err(general_err!("Encrypted parquet files can't have more than {} row groups: {}", i16::MAX, row_group_ordinal)); } - if column_ordinal < 0 { - return Err(general_err!("Wrong column ordinal: {}", column_ordinal)); - } - // todo: this check is a noop here - if column_ordinal > i16::MAX { + if column_ordinal > i16::MAX as usize { return Err(general_err!("Encrypted parquet files can't have more than {} columns: {}", i16::MAX, column_ordinal)); } @@ -219,16 +211,17 @@ pub fn create_module_aad(file_aad: &[u8], module_type: ModuleType, row_group_ord let page_ordinal = page_ordinal.ok_or_else(|| general_err!( "Page ordinal must be set for data pages"))?; - if page_ordinal < 0 { - return Err(general_err!("Wrong page 
ordinal: {}", page_ordinal)); + if page_ordinal > i16::MAX as usize { + return Err(general_err!("Encrypted parquet files can't have more than {} pages per column chunk: {}", + i16::MAX, page_ordinal)); } let mut aad = Vec::with_capacity(file_aad.len() + 7); aad.extend_from_slice(file_aad); aad.extend_from_slice(module_buf.as_ref()); - aad.extend_from_slice(row_group_ordinal.to_le_bytes().as_ref()); - aad.extend_from_slice(column_ordinal.to_le_bytes().as_ref()); - aad.extend_from_slice(page_ordinal.to_le_bytes().as_ref()); + aad.extend_from_slice((row_group_ordinal as i16).to_le_bytes().as_ref()); + aad.extend_from_slice((column_ordinal as i16).to_le_bytes().as_ref()); + aad.extend_from_slice((page_ordinal as i16).to_le_bytes().as_ref()); Ok(aad) } @@ -317,17 +310,17 @@ impl FileDecryptor { #[derive(Debug, Clone)] pub struct CryptoContext { - pub(crate) row_group_ordinal: i16, - pub(crate) column_ordinal: i16, - pub(crate) page_ordinal: Option, + pub(crate) row_group_ordinal: usize, + pub(crate) column_ordinal: usize, + pub(crate) page_ordinal: Option, pub(crate) dictionary_page: bool, pub(crate) data_decryptor: Arc, pub(crate) metadata_decryptor: Arc, } impl CryptoContext { - pub fn new(row_group_ordinal: i16, - column_ordinal: i16, data_decryptor: Arc, + pub fn new(row_group_ordinal: usize, + column_ordinal: usize, data_decryptor: Arc, metadata_decryptor: Arc) -> Self { Self { row_group_ordinal, @@ -339,7 +332,7 @@ impl CryptoContext { } } - pub fn with_page_ordinal(&self, page_ordinal: i16) -> Self { + pub fn with_page_ordinal(&self, page_ordinal: usize) -> Self { Self { row_group_ordinal: self.row_group_ordinal, column_ordinal: self.column_ordinal, @@ -361,9 +354,6 @@ impl CryptoContext { } } - pub fn row_group_ordinal(&self) -> &i16 { &self.row_group_ordinal } - pub fn column_ordinal(&self) -> &i16 { &self.column_ordinal } - pub fn page_ordinal(&self) -> &Option { &self.page_ordinal } pub fn data_decryptor(&self) -> Arc { self.data_decryptor.clone()} pub fn 
metadata_decryptor(&self) -> Arc { self.metadata_decryptor.clone() } } diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index 8fbdeb0ae882..ceb5ed222ea8 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -33,17 +33,14 @@ use crate::file::{ reader::*, statistics, }; -use crate::format::{PageHeader, PageLocation, PageType, FileCryptoMetaData as TFileCryptoMetaData, EncryptionAlgorithm}; +use crate::format::{PageHeader, PageLocation, PageType}; use crate::record::reader::RowIter; use crate::record::Row; use crate::schema::types::Type as SchemaType; use crate::thrift::{TCompactSliceInputProtocol, TSerializable}; use bytes::Bytes; -use num::ToPrimitive; -use thrift::protocol::{TCompactInputProtocol, TInputProtocol}; -use zstd::zstd_safe::WriteBuf; -use crate::data_type::AsBytes; -use crate::encryption::ciphers::{create_page_aad, BlockDecryptor, CryptoContext, FileDecryptionProperties, ModuleType}; +use thrift::protocol::TCompactInputProtocol; +use crate::encryption::ciphers::{create_page_aad, BlockDecryptor, CryptoContext, ModuleType}; impl TryFrom for SerializedFileReader { type Error = ParquetError; @@ -953,14 +950,12 @@ impl PageReader for SerializedPageReader { } fn page_crypto_context(crypto_context: &Option>, page_ordinal: usize, dictionary_page: bool) -> Result>> { - let page_ordinal = page_ordinal - .to_i16() - .ok_or_else(|| general_err!( - "Page ordinal {} is greater than the maximum allowed in encrypted Parquet files ({})", - page_ordinal, i16::MAX))?; - Ok(crypto_context.as_ref().map( - |c| Arc::new(if dictionary_page { c.for_dictionary_page() } else { c.with_page_ordinal(page_ordinal) }))) + |c| Arc::new(if dictionary_page { + c.for_dictionary_page() + } else { + c.with_page_ordinal(page_ordinal) + }))) } #[cfg(test)] From a2507c59b19eb73bde6183b7b854c35a08e43027 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 18 Dec 2024 18:34:55 +0100 Subject: [PATCH 13/97] Lint 
--- parquet/src/arrow/arrow_reader/mod.rs | 43 +++++++++++++------- parquet/src/file/metadata/mod.rs | 10 +++-- parquet/src/file/metadata/reader.rs | 17 +++++--- parquet/src/file/serialized_reader.rs | 58 +++++++++++++++++++-------- 4 files changed, 89 insertions(+), 39 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index a558512d8212..4ef374a347cd 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -17,9 +17,6 @@ //! Contains reader which reads parquet data into arrow [`RecordBatch`] -use std::collections::VecDeque; -use std::sync::Arc; -use num::ToPrimitive; use arrow_array::cast::AsArray; use arrow_array::Array; use arrow_array::{RecordBatch, RecordBatchReader}; @@ -27,6 +24,8 @@ use arrow_schema::{ArrowError, DataType as ArrowType, Schema, SchemaRef}; use arrow_select::filter::prep_null_mask_filter; pub use filter::{ArrowPredicate, ArrowPredicateFn, RowFilter}; pub use selection::{RowSelection, RowSelector}; +use std::collections::VecDeque; +use std::sync::Arc; pub use crate::arrow::array_reader::RowGroups; use crate::arrow::array_reader::{build_array_reader, ArrayReader}; @@ -695,14 +694,22 @@ impl Iterator for ReaderPageIterator { let total_rows = rg.num_rows() as usize; let reader = self.reader.clone(); - let file_decryptor = Arc::new(self.metadata.file_decryptor().clone().unwrap()); + let crypto_context = if self.metadata.file_decryptor().is_some() { + let file_decryptor = Arc::new(self.metadata.file_decryptor().clone().unwrap()); - let crypto_context = CryptoContext::new( - rg_idx, self.column_idx, file_decryptor.clone(), file_decryptor); - let crypto_context = Arc::new(crypto_context); + let crypto_context = CryptoContext::new( + rg_idx, + self.column_idx, + file_decryptor.clone(), + file_decryptor, + ); + Some(Arc::new(crypto_context)) + } else { + None + }; - let ret = SerializedPageReader::new(reader, meta, total_rows, page_locations, 
Some(crypto_context)); - // let ret = SerializedPageReader::new(reader, meta, total_rows, page_locations); + let ret = + SerializedPageReader::new(reader, meta, total_rows, page_locations, crypto_context); Some(ret.map(|x| Box::new(x) as _)) } } @@ -1846,12 +1853,17 @@ mod tests { .build(), ); - let metadata = ArrowReaderMetadata::load(&file, Default::default(), decryption_properties.as_ref()).unwrap(); + let metadata = + ArrowReaderMetadata::load(&file, Default::default(), decryption_properties.as_ref()) + .unwrap(); let file_metadata = metadata.metadata.file_metadata(); assert_eq!(file_metadata.num_rows(), 50); assert_eq!(file_metadata.schema_descr().num_columns(), 8); - assert_eq!(file_metadata.created_by().unwrap(), "parquet-cpp-arrow version 14.0.0-SNAPSHOT"); + assert_eq!( + file_metadata.created_by().unwrap(), + "parquet-cpp-arrow version 14.0.0-SNAPSHOT" + ); metadata.metadata.row_groups().iter().for_each(|rg| { assert_eq!(rg.num_columns(), 8); @@ -1865,9 +1877,12 @@ mod tests { .with_footer_key(key_code.to_vec()) .build(), ); - let record_reader = - ParquetRecordBatchReader::try_new_with_decryption(file, 128, decryption_properties.as_ref()) - .unwrap(); + let record_reader = ParquetRecordBatchReader::try_new_with_decryption( + file, + 128, + decryption_properties.as_ref(), + ) + .unwrap(); // todo check contents let mut row_count = 0; for batch in record_reader { diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index 1bbbecc7f54a..2e8c654a267f 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -104,6 +104,7 @@ use crate::format::{ }; use crate::basic::{ColumnOrder, Compression, Encoding, Type}; +use crate::encryption::ciphers::FileDecryptor; use crate::errors::{ParquetError, Result}; pub(crate) use crate::file::metadata::memory::HeapSize; use crate::file::page_encoding_stats::{self, PageEncodingStats}; @@ -117,7 +118,6 @@ use crate::schema::types::{ pub use reader::ParquetMetaDataReader; 
pub use writer::ParquetMetaDataWriter; pub(crate) use writer::ThriftMetadataWriter; -use crate::encryption::ciphers::FileDecryptor; /// Page level statistics for each column chunk of each row group. /// @@ -182,7 +182,11 @@ pub struct ParquetMetaData { impl ParquetMetaData { /// Creates Parquet metadata from file metadata and a list of row /// group metadata - pub fn new(file_metadata: FileMetaData, row_groups: Vec, file_decryptor: Option) -> Self { + pub fn new( + file_metadata: FileMetaData, + row_groups: Vec, + file_decryptor: Option, + ) -> Self { ParquetMetaData { file_metadata, row_groups, @@ -223,8 +227,6 @@ impl ParquetMetaData { &self.file_decryptor } - - /// Returns number of row groups in this file. pub fn num_row_groups(&self) -> usize { self.row_groups.len() diff --git a/parquet/src/file/metadata/reader.rs b/parquet/src/file/metadata/reader.rs index f32b5eb8bdd5..cacd76695d96 100644 --- a/parquet/src/file/metadata/reader.rs +++ b/parquet/src/file/metadata/reader.rs @@ -676,14 +676,17 @@ impl ParquetMetaDataReader { // todo decr: set both aad_prefix and aad_file_unique in file_decryptor let aad_file_unique = aes_gcm_algo.aad_file_unique.unwrap(); let aad_footer = create_footer_aad(aad_file_unique.as_ref())?; - let aad_prefix : Vec = aes_gcm_algo.aad_prefix.unwrap_or_default(); + let aad_prefix: Vec = aes_gcm_algo.aad_prefix.unwrap_or_default(); - file_decryptor = Some(FileDecryptor::new(file_decryption_properties, aad_file_unique.clone(), aad_prefix.clone())); + file_decryptor = Some(FileDecryptor::new( + file_decryption_properties, + aad_file_unique.clone(), + aad_prefix.clone(), + )); let decryptor = file_decryptor.clone().unwrap().get_footer_decryptor(); // file_decryptor = Some(FileDecryptor::new(file_decryption_properties, aad, aad_prefix)); - decrypted_fmd_buf = - decryptor.decrypt(prot.as_slice().as_ref(), aad_footer.as_ref())?; + decrypted_fmd_buf = decryptor.decrypt(prot.as_slice().as_ref(), aad_footer.as_ref())?; prot = 
TCompactSliceInputProtocol::new(decrypted_fmd_buf.as_ref()); } @@ -713,7 +716,11 @@ impl ParquetMetaDataReader { schema_descr, column_orders, ); - Ok(ParquetMetaData::new(file_metadata, row_groups, Some(file_decryptor.unwrap()))) + Ok(ParquetMetaData::new( + file_metadata, + row_groups, + Some(file_decryptor.unwrap()), + )) } /// Parses column orders from Thrift definition. diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index ceb5ed222ea8..aafd5089b9d5 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -18,13 +18,11 @@ //! Contains implementations of the reader traits FileReader, RowGroupReader and PageReader //! Also contains implementations of the ChunkReader for files (with buffering) and byte arrays (RAM) -use std::collections::VecDeque; -use std::iter; -use std::{fs::File, io::Read, path::Path, sync::Arc}; use crate::basic::{Encoding, Type}; use crate::bloom_filter::Sbbf; use crate::column::page::{Page, PageMetadata, PageReader}; use crate::compression::{create_codec, Codec}; +use crate::encryption::ciphers::{create_page_aad, BlockDecryptor, CryptoContext, ModuleType}; use crate::errors::{ParquetError, Result}; use crate::file::page_index::offset_index::OffsetIndexMetaData; use crate::file::{ @@ -39,8 +37,10 @@ use crate::record::Row; use crate::schema::types::Type as SchemaType; use crate::thrift::{TCompactSliceInputProtocol, TSerializable}; use bytes::Bytes; +use std::collections::VecDeque; +use std::iter; +use std::{fs::File, io::Read, path::Path, sync::Arc}; use thrift::protocol::TCompactInputProtocol; -use crate::encryption::ciphers::{create_page_aad, BlockDecryptor, CryptoContext, ModuleType}; impl TryFrom for SerializedFileReader { type Error = ParquetError; @@ -339,7 +339,10 @@ impl RowGroupReader for SerializedRowGroupReader<'_, R } /// Reads a [`PageHeader`] from the provided [`Read`] -pub(crate) fn read_page_header(input: &mut T, crypto_context: Option>) -> 
Result { +pub(crate) fn read_page_header( + input: &mut T, + crypto_context: Option>, +) -> Result { if let Some(crypto_context) = crypto_context { let decryptor = &crypto_context.data_decryptor(); let file_decryptor = decryptor.footer_decryptor(); @@ -367,7 +370,7 @@ pub(crate) fn read_page_header(input: &mut T, crypto_context: Option(input: &mut T, crypto_context: Option(input: &mut T, crypto_context: Option>) -> Result<(usize, PageHeader)> { +fn read_page_header_len( + input: &mut T, + crypto_context: Option>, +) -> Result<(usize, PageHeader)> { /// A wrapper around a [`std::io::Read`] that keeps track of the bytes read struct TrackedRead { inner: R, @@ -432,7 +438,7 @@ pub(crate) fn decode_page( can_decompress = header_v2.is_compressed.unwrap_or(true); } - let buffer : Bytes = if crypto_context.is_some() { + let buffer: Bytes = if crypto_context.is_some() { let crypto_context = crypto_context.as_ref().unwrap(); let decryptor = crypto_context.data_decryptor(); let file_decryptor = decryptor.footer_decryptor(); @@ -589,7 +595,14 @@ impl SerializedPageReader { crypto_context: Option>, ) -> Result { let props = Arc::new(ReaderProperties::builder().build()); - SerializedPageReader::new_with_properties(reader, meta, total_rows, page_locations, props, crypto_context) + SerializedPageReader::new_with_properties( + reader, + meta, + total_rows, + page_locations, + props, + crypto_context, + ) } /// Creates a new serialized page with custom options. 
@@ -636,7 +649,7 @@ impl SerializedPageReader { state, physical_type: meta.column_type(), crypto_context, - }) + }); } Ok(Self { reader, @@ -753,7 +766,11 @@ impl PageReader for SerializedPageReader { let header = if let Some(header) = next_page_header.take() { *header } else { - let crypto_context = page_crypto_context(&self.crypto_context, *page_ordinal, *require_dictionary)?; + let crypto_context = page_crypto_context( + &self.crypto_context, + *page_ordinal, + *require_dictionary, + )?; let (header_len, header) = read_page_header_len(&mut read, crypto_context)?; verify_page_header_len(header_len, *remaining)?; *offset += header_len; @@ -784,7 +801,11 @@ impl PageReader for SerializedPageReader { )); } - let crypto_context = page_crypto_context(&self.crypto_context, *page_ordinal, *require_dictionary)?; + let crypto_context = page_crypto_context( + &self.crypto_context, + *page_ordinal, + *require_dictionary, + )?; let page = decode_page( header, Bytes::from(buffer), @@ -949,13 +970,18 @@ impl PageReader for SerializedPageReader { } } -fn page_crypto_context(crypto_context: &Option>, page_ordinal: usize, dictionary_page: bool) -> Result>> { - Ok(crypto_context.as_ref().map( - |c| Arc::new(if dictionary_page { +fn page_crypto_context( + crypto_context: &Option>, + page_ordinal: usize, + dictionary_page: bool, +) -> Result>> { + Ok(crypto_context.as_ref().map(|c| { + Arc::new(if dictionary_page { c.for_dictionary_page() } else { c.with_page_ordinal(page_ordinal) - }))) + }) + })) } #[cfg(test)] From 3a5e8bbc53101ef1ff477e98f91d1b71da43ca01 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 18 Dec 2024 18:45:12 +0100 Subject: [PATCH 14/97] Fix regular deserialization path --- parquet/src/file/metadata/reader.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/parquet/src/file/metadata/reader.rs b/parquet/src/file/metadata/reader.rs index cacd76695d96..28eac350e87e 100644 --- a/parquet/src/file/metadata/reader.rs +++ 
b/parquet/src/file/metadata/reader.rs @@ -684,7 +684,6 @@ impl ParquetMetaDataReader { aad_prefix.clone(), )); let decryptor = file_decryptor.clone().unwrap().get_footer_decryptor(); - // file_decryptor = Some(FileDecryptor::new(file_decryption_properties, aad, aad_prefix)); decrypted_fmd_buf = decryptor.decrypt(prot.as_slice().as_ref(), aad_footer.as_ref())?; prot = TCompactSliceInputProtocol::new(decrypted_fmd_buf.as_ref()); @@ -719,7 +718,7 @@ impl ParquetMetaDataReader { Ok(ParquetMetaData::new( file_metadata, row_groups, - Some(file_decryptor.unwrap()), + file_decryptor, )) } From ffd4a7eb9796879c540aa61e43f6c1d35f0c2f3e Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 18 Dec 2024 22:04:45 +0100 Subject: [PATCH 15/97] cleaning --- parquet/src/arrow/async_reader/metadata.rs | 2 -- parquet/src/arrow/async_reader/mod.rs | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/parquet/src/arrow/async_reader/metadata.rs b/parquet/src/arrow/async_reader/metadata.rs index 084131b9cdcf..34e14cb5fc50 100644 --- a/parquet/src/arrow/async_reader/metadata.rs +++ b/parquet/src/arrow/async_reader/metadata.rs @@ -127,14 +127,12 @@ impl MetadataLoader { let (metadata, remainder) = if length > suffix_len - FOOTER_SIZE { let metadata_start = file_size - length - FOOTER_SIZE; let meta = fetch.fetch(metadata_start..file_size - FOOTER_SIZE).await?; - // TODO: this won't decrypt (ParquetMetaDataReader::decode_metadata(&meta, None)?, None) } else { let metadata_start = file_size - length - FOOTER_SIZE - footer_start; let slice = &suffix[metadata_start..suffix_len - FOOTER_SIZE]; ( - // TODO: this won't decrypt ParquetMetaDataReader::decode_metadata(slice, None)?, Some((footer_start, suffix.slice(..metadata_start))), ) diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index e35342cd32e6..89dd32c18f4a 100644 --- a/parquet/src/arrow/async_reader/mod.rs +++ b/parquet/src/arrow/async_reader/mod.rs @@ -153,7 +153,7 @@ impl 
AsyncFileReader for T { let mut buf = Vec::with_capacity(metadata_len); self.take(metadata_len as _).read_to_end(&mut buf).await?; - // TODO: add self.file_decryption_properties + // todo: use file_decryption_properties Ok(Arc::new(ParquetMetaDataReader::decode_metadata( &buf, None, )?)) From 22b2abbdbde6662efb4644cfac299cbcc597d680 Mon Sep 17 00:00:00 2001 From: Adam Reeve Date: Thu, 19 Dec 2024 21:23:21 +1300 Subject: [PATCH 16/97] Update data checks in test --- parquet/src/arrow/arrow_reader/mod.rs | 58 +++++++++++++++++++++------ 1 file changed, 46 insertions(+), 12 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 4ef374a347cd..c00c336b3a23 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -1862,7 +1862,7 @@ mod tests { assert_eq!(file_metadata.schema_descr().num_columns(), 8); assert_eq!( file_metadata.created_by().unwrap(), - "parquet-cpp-arrow version 14.0.0-SNAPSHOT" + "parquet-cpp-arrow version 19.0.0-SNAPSHOT" ); metadata.metadata.row_groups().iter().for_each(|rg| { @@ -1871,7 +1871,6 @@ mod tests { assert_eq!(rg.total_byte_size(), 4172); }); - // todo: decrypting data let decryption_properties = Some( ciphers::FileDecryptionProperties::builder() .with_footer_key(key_code.to_vec()) @@ -1883,24 +1882,59 @@ mod tests { decryption_properties.as_ref(), ) .unwrap(); - // todo check contents + let mut row_count = 0; for batch in record_reader { let batch = batch.unwrap(); row_count += batch.num_rows(); - let f32_col = batch.column(0).as_primitive::(); - let f64_col = batch.column(1).as_primitive::(); - // This file contains floats from a standard normal distribution - for &x in f32_col.values() { - assert!(x > -10.0); - assert!(x < 10.0); + let bool_col = batch.column(0).as_boolean(); + let time_col = batch + .column(1) + .as_primitive::(); + let list_col = batch.column(2).as_list::(); + let timestamp_col = batch + .column(3) + .as_primitive::(); + let 
f32_col = batch.column(4).as_primitive::(); + let f64_col = batch.column(5).as_primitive::(); + let binary_col = batch.column(6).as_binary::(); + let fixed_size_binary_col = batch.column(7).as_fixed_size_binary(); + + for (i, x) in bool_col.iter().enumerate() { + assert_eq!(x.unwrap(), i % 2 == 0); } - for &x in f64_col.values() { - assert!(x > -10.0); - assert!(x < 10.0); + for (i, x) in time_col.iter().enumerate() { + assert_eq!(x.unwrap(), i as i32); + } + for (i, list_item) in list_col.iter().enumerate() { + let list_item = list_item.unwrap(); + let list_item = list_item.as_primitive::(); + assert_eq!(list_item.len(), 2); + assert_eq!(list_item.value(0), ((i * 2) * 1000000000000) as i64); + assert_eq!(list_item.value(1), ((i * 2 + 1) * 1000000000000) as i64); + } + for x in timestamp_col.iter() { + assert!(x.is_some()); + } + for (i, x) in f32_col.iter().enumerate() { + assert_eq!(x.unwrap(), i as f32 * 1.1f32); + } + for (i, x) in f64_col.iter().enumerate() { + assert_eq!(x.unwrap(), i as f64 * 1.1111111f64); + } + for (i, x) in binary_col.iter().enumerate() { + assert_eq!(x.is_some(), i % 2 == 0); + if let Some(x) = x { + assert_eq!(&x[0..7], b"parquet"); + } + } + for (i, x) in fixed_size_binary_col.iter().enumerate() { + assert_eq!(x.unwrap(), &[i as u8; 10]); } } + + assert_eq!(row_count, file_metadata.num_rows() as usize); } #[test] From 9f5879274f5ba0e9fe07ef32509c6f126a800666 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 19 Dec 2024 23:41:39 +0100 Subject: [PATCH 17/97] start non-uniform decryption --- parquet/src/arrow/arrow_reader/mod.rs | 81 +++++++++++++++++++++++++++ parquet/src/encryption/ciphers.rs | 27 +++++++-- parquet/src/file/metadata/reader.rs | 9 ++- parquet/src/file/serialized_reader.rs | 2 + 4 files changed, 112 insertions(+), 7 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index c00c336b3a23..1db5181d526d 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ 
b/parquet/src/arrow/arrow_reader/mod.rs @@ -1840,6 +1840,87 @@ mod tests { assert!(col.value(2).is_nan()); } + #[test] + fn test_non_uniform_encryption_plaintext_footer() { + let testdata = arrow::util::test_util::parquet_test_data(); + let path = format!("{testdata}/encrypt_columns_plaintext_footer.parquet.encrypted"); + let file = File::open(path).unwrap(); + + let column_1_key = "1234567890123450".as_bytes(); + let column_2_key = "1234567890123451".as_bytes(); + + let decryption_properties = Some( + ciphers::FileDecryptionProperties::builder() + .with_column_key("kc1".as_bytes().to_vec(), column_1_key.to_vec()) + .with_column_key("kc2".as_bytes().to_vec(), column_2_key.to_vec()) + .build(), + ); + + let metadata = + ArrowReaderMetadata::load(&file, Default::default(), decryption_properties.as_ref()) + .unwrap(); + let file_metadata = metadata.metadata.file_metadata(); + + assert_eq!(file_metadata.num_rows(), 50); + assert_eq!(file_metadata.schema_descr().num_columns(), 8); + assert_eq!( + file_metadata.created_by().unwrap(), + "parquet-cpp-arrow version 14.0.0-SNAPSHOT" + ); + + metadata.metadata.row_groups().iter().for_each(|rg| { + assert_eq!(rg.num_columns(), 8); + assert_eq!(rg.num_rows(), 50); + assert_eq!(rg.total_byte_size(), 3816); + }); + + let record_reader = ParquetRecordBatchReader::try_new_with_decryption( + file, + 128, + decryption_properties.as_ref(), + ) + .unwrap(); + + let mut row_count = 0; + for batch in record_reader { + let batch = batch.unwrap(); + row_count += batch.num_rows(); + } + + assert_eq!(row_count, file_metadata.num_rows() as usize); + } + + #[test] + fn test_non_uniform_encryption() { + let testdata = arrow::util::test_util::parquet_test_data(); + let path = format!("{testdata}/encrypt_columns_plaintext_footer.parquet.encrypted"); + let file = File::open(path).unwrap(); + + let footer_key = "0123456789012345".as_bytes(); // 128bit/16 + let column_1_key = "1234567890123450".as_bytes(); + let column_2_key = 
"1234567890123451".as_bytes(); + + let decryption_properties = Some( + ciphers::FileDecryptionProperties::builder() + .with_footer_key(footer_key.to_vec()) + .with_column_key("kc1".as_bytes().to_vec(), column_1_key.to_vec()) + .with_column_key("kc2".as_bytes().to_vec(), column_2_key.to_vec()) + .build(), + ); + + let metadata = + ArrowReaderMetadata::load(&file, Default::default(), decryption_properties.as_ref()) + .unwrap(); + // let file_metadata = metadata.metadata.file_metadata(); + // + // assert_eq!(file_metadata.num_rows(), 50); + // assert_eq!(file_metadata.schema_descr().num_columns(), 8); + // assert_eq!( + // file_metadata.created_by().unwrap(), + // "parquet-cpp-arrow version 19.0.0-SNAPSHOT" + // ); + } + #[test] fn test_uniform_encryption() { let testdata = arrow::util::test_util::parquet_test_data(); diff --git a/parquet/src/encryption/ciphers.rs b/parquet/src/encryption/ciphers.rs index 6ecbb003dd5b..0b8d9246bca2 100644 --- a/parquet/src/encryption/ciphers.rs +++ b/parquet/src/encryption/ciphers.rs @@ -18,6 +18,7 @@ //! Encryption implementation specific to Parquet, as described //! in the [spec](https://github.com/apache/parquet-format/blob/master/Encryption.md). 
+use std::collections::HashMap; use std::sync::Arc; use ring::aead::{Aad, LessSafeKey, NonceSequence, UnboundKey, AES_128_GCM}; use ring::rand::{SecureRandom, SystemRandom}; @@ -227,29 +228,34 @@ fn create_module_aad(file_aad: &[u8], module_type: ModuleType, row_group_ordinal #[derive(Debug, Clone, PartialEq)] pub struct FileDecryptionProperties { - footer_key: Option> + footer_key: Option>, + column_keys: Option, Vec>>, } impl FileDecryptionProperties { pub fn builder() -> DecryptionPropertiesBuilder { DecryptionPropertiesBuilder::with_defaults() } + pub fn has_footer_key(&self) -> bool { self.footer_key.is_some() } } pub struct DecryptionPropertiesBuilder { - footer_key: Option> + footer_key: Option>, + column_keys: Option, Vec>>, } impl DecryptionPropertiesBuilder { pub fn with_defaults() -> Self { Self { - footer_key: None + footer_key: None, + column_keys: None, } } pub fn build(self) -> FileDecryptionProperties { FileDecryptionProperties { - footer_key: self.footer_key + footer_key: self.footer_key, + column_keys: self.column_keys, } } @@ -258,6 +264,14 @@ impl DecryptionPropertiesBuilder { self.footer_key = Some(value); self } + + pub fn with_column_key(mut self, key: Vec, value: Vec) -> Self { + let mut column_keys= self.column_keys.unwrap_or_else(HashMap::new); + column_keys.insert(key, value); + // let _ = column_keys.insert(key, value); + self.column_keys = Some(column_keys); + self + } } #[derive(Debug, Clone)] @@ -291,6 +305,11 @@ impl FileDecryptor { self.footer_decryptor } + pub(crate) fn get_column_decryptor(&self, column_key: &[u8]) -> RingGcmBlockDecryptor { + let column_key = self.decryption_properties.column_keys.as_ref().unwrap().get(column_key).unwrap(); + RingGcmBlockDecryptor::new(column_key) + } + pub(crate) fn decryption_properties(&self) -> &FileDecryptionProperties { &self.decryption_properties } diff --git a/parquet/src/file/metadata/reader.rs b/parquet/src/file/metadata/reader.rs index 28eac350e87e..4eacad6dcff6 100644 --- 
a/parquet/src/file/metadata/reader.rs +++ b/parquet/src/file/metadata/reader.rs @@ -654,10 +654,12 @@ impl ParquetMetaDataReader { file_decryption_properties: Option<&FileDecryptionProperties>, ) -> Result { let mut prot = TCompactSliceInputProtocol::new(buf); - let mut file_decryptor = None; let decrypted_fmd_buf; - if let Some(file_decryption_properties) = file_decryption_properties { + + if file_decryption_properties.is_some() + && file_decryption_properties.unwrap().has_footer_key() + { let t_file_crypto_metadata: TFileCryptoMetaData = TFileCryptoMetaData::read_from_in_protocol(&mut prot) .map_err(|e| general_err!("Could not parse crypto metadata: {}", e))?; @@ -679,7 +681,7 @@ impl ParquetMetaDataReader { let aad_prefix: Vec = aes_gcm_algo.aad_prefix.unwrap_or_default(); file_decryptor = Some(FileDecryptor::new( - file_decryption_properties, + file_decryption_properties.unwrap(), aad_file_unique.clone(), aad_prefix.clone(), )); @@ -696,6 +698,7 @@ impl ParquetMetaDataReader { let mut row_groups = Vec::new(); // TODO: row group filtering for rg in t_file_metadata.row_groups { + // rg. 
row_groups.push(RowGroupMetaData::from_thrift(schema_descr.clone(), rg)?); } let column_orders = diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index aafd5089b9d5..1f1b61e7790c 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -345,6 +345,8 @@ pub(crate) fn read_page_header( ) -> Result { if let Some(crypto_context) = crypto_context { let decryptor = &crypto_context.data_decryptor(); + // todo: get column decryptor + // let file_decryptor = decryptor.get_column_decryptor(crypto_context.column_ordinal); let file_decryptor = decryptor.footer_decryptor(); let aad_file_unique = decryptor.aad_file_unique(); From e6e056a62383437545814a4613fdba435249ab2e Mon Sep 17 00:00:00 2001 From: Adam Reeve Date: Fri, 20 Dec 2024 10:53:46 +1300 Subject: [PATCH 18/97] Add missing doc comments --- parquet/src/column/page.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/parquet/src/column/page.rs b/parquet/src/column/page.rs index 931241e4259f..b5afe6b93389 100644 --- a/parquet/src/column/page.rs +++ b/parquet/src/column/page.rs @@ -90,10 +90,12 @@ impl Page { } } + /// Returns whether this page is any version of a data page pub fn is_data_page(&self) -> bool { matches!(self, Page::DataPage { .. } | Page::DataPageV2 { .. }) } + /// Returns whether this page is a dictionary page pub fn is_dictionary_page(&self) -> bool { matches!(self, Page::DictionaryPage { .. 
}) } From cca1155b7ddf0cdfe77d2bfd7f0a138d476410bb Mon Sep 17 00:00:00 2001 From: Adam Reeve Date: Fri, 20 Dec 2024 13:31:34 +1300 Subject: [PATCH 19/97] Make encryption an optional feature --- parquet/Cargo.toml | 4 +- parquet/src/arrow/arrow_reader/mod.rs | 44 +++++++++++++++------ parquet/src/column/writer/mod.rs | 4 ++ parquet/src/errors.rs | 2 +- parquet/src/file/footer.rs | 9 ++++- parquet/src/file/metadata/mod.rs | 13 +++++- parquet/src/file/metadata/reader.rs | 20 +++++++--- parquet/src/file/serialized_reader.rs | 57 ++++++++++++++++++--------- parquet/src/file/writer.rs | 1 + parquet/src/lib.rs | 2 +- 10 files changed, 114 insertions(+), 42 deletions(-) diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index cc7b5688742a..895e091b6d2b 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -70,7 +70,7 @@ half = { version = "2.1", default-features = false, features = ["num-traits"] } sysinfo = { version = "0.33.0", optional = true, default-features = false, features = ["system"] } crc32fast = { version = "1.4.2", optional = true, default-features = false } simdutf8 = { version = "0.1.5", optional = true, default-features = false } -ring = { version = "0.17", default-features = false, features = ["std"]} +ring = { version = "0.17", default-features = false, features = ["std"], optional = true } [dev-dependencies] base64 = { version = "0.22", default-features = false, features = ["std"] } @@ -127,6 +127,8 @@ crc = ["dep:crc32fast"] # Enable SIMD UTF-8 validation simdutf8 = ["dep:simdutf8"] #encryption = ["aes-gcm", "base64"] +# Enable Parquet modular encryption support +encryption = ["dep:ring"] [[example]] diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 1db5181d526d..70d38cd8faed 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -41,6 +41,7 @@ mod filter; mod selection; pub mod statistics; +#[cfg(feature = "encryption")] use 
crate::encryption::ciphers::{CryptoContext, FileDecryptionProperties}; /// Builder for constructing parquet readers into arrow. @@ -383,12 +384,14 @@ impl ArrowReaderMetadata { pub fn load( reader: &T, options: ArrowReaderOptions, - file_decryption_properties: Option<&FileDecryptionProperties>, + #[cfg(feature = "encryption")] file_decryption_properties: Option< + &FileDecryptionProperties, + >, ) -> Result { - let metadata = ParquetMetaDataReader::new() - .with_page_indexes(options.page_index) - .with_encryption_properties(file_decryption_properties) - .parse_and_finish(reader)?; + let metadata = ParquetMetaDataReader::new().with_page_indexes(options.page_index); + #[cfg(feature = "encryption")] + let metadata = metadata.with_encryption_properties(file_decryption_properties); + let metadata = metadata.parse_and_finish(reader)?; Self::try_new(Arc::new(metadata), options) } @@ -534,11 +537,17 @@ impl ParquetRecordBatchReaderBuilder { /// Create a new [`ParquetRecordBatchReaderBuilder`] with [`ArrowReaderOptions`] pub fn try_new_with_options(reader: T, options: ArrowReaderOptions) -> Result { - let metadata = ArrowReaderMetadata::load(&reader, options, None)?; + let metadata = ArrowReaderMetadata::load( + &reader, + options, + #[cfg(feature = "encryption")] + None, + )?; Ok(Self::new_with_metadata(reader, metadata)) } /// Create a new [`ParquetRecordBatchReaderBuilder`] with [`ArrowReaderOptions`] and [`FileDecryptionProperties`] + #[cfg(feature = "encryption")] pub fn try_new_with_decryption( reader: T, options: ArrowReaderOptions, @@ -694,6 +703,7 @@ impl Iterator for ReaderPageIterator { let total_rows = rg.num_rows() as usize; let reader = self.reader.clone(); + #[cfg(feature = "encryption")] let crypto_context = if self.metadata.file_decryptor().is_some() { let file_decryptor = Arc::new(self.metadata.file_decryptor().clone().unwrap()); @@ -708,8 +718,14 @@ impl Iterator for ReaderPageIterator { None }; - let ret = - SerializedPageReader::new(reader, meta, 
total_rows, page_locations, crypto_context); + let ret = SerializedPageReader::new( + reader, + meta, + total_rows, + page_locations, + #[cfg(feature = "encryption")] + crypto_context, + ); Some(ret.map(|x| Box::new(x) as _)) } } @@ -824,6 +840,7 @@ impl ParquetRecordBatchReader { /// /// Note: this is needed when the parquet file is encrypted // todo: add options or put file_decryption_properties into options + #[cfg(feature = "encryption")] pub fn try_new_with_decryption( reader: T, batch_size: usize, @@ -993,10 +1010,11 @@ mod tests { }; use arrow_select::concat::concat_batches; + #[cfg(feature = "encryption")] + use crate::arrow::arrow_reader::ArrowReaderMetadata; use crate::arrow::arrow_reader::{ - ArrowPredicateFn, ArrowReaderBuilder, ArrowReaderMetadata, ArrowReaderOptions, - ParquetRecordBatchReader, ParquetRecordBatchReaderBuilder, RowFilter, RowSelection, - RowSelector, + ArrowPredicateFn, ArrowReaderBuilder, ArrowReaderOptions, ParquetRecordBatchReader, + ParquetRecordBatchReaderBuilder, RowFilter, RowSelection, RowSelector, }; use crate::arrow::schema::add_encoded_arrow_schema_to_metadata; use crate::arrow::{ArrowWriter, ProjectionMask}; @@ -1006,6 +1024,7 @@ mod tests { BoolType, ByteArray, ByteArrayType, DataType, FixedLenByteArray, FixedLenByteArrayType, FloatType, Int32Type, Int64Type, Int96Type, }; + #[cfg(feature = "encryption")] use crate::encryption::ciphers; use crate::errors::Result; use crate::file::properties::{EnabledStatistics, WriterProperties, WriterVersion}; @@ -1841,6 +1860,7 @@ mod tests { } #[test] + #[cfg(feature = "encryption")] fn test_non_uniform_encryption_plaintext_footer() { let testdata = arrow::util::test_util::parquet_test_data(); let path = format!("{testdata}/encrypt_columns_plaintext_footer.parquet.encrypted"); @@ -1891,6 +1911,7 @@ mod tests { } #[test] + #[cfg(feature = "encryption")] fn test_non_uniform_encryption() { let testdata = arrow::util::test_util::parquet_test_data(); let path = 
format!("{testdata}/encrypt_columns_plaintext_footer.parquet.encrypted"); @@ -1922,6 +1943,7 @@ mod tests { } #[test] + #[cfg(feature = "encryption")] fn test_uniform_encryption() { let testdata = arrow::util::test_util::parquet_test_data(); let path = format!("{testdata}/uniform_encryption.parquet.encrypted"); diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index 2c0ba2e05ad1..50156a26e276 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -2105,6 +2105,7 @@ mod tests { r.rows_written as usize, None, Arc::new(props), + #[cfg(feature = "encryption")] None, ) .unwrap(); @@ -2158,6 +2159,7 @@ mod tests { r.rows_written as usize, None, Arc::new(props), + #[cfg(feature = "encryption")] None, ) .unwrap(); @@ -2294,6 +2296,7 @@ mod tests { r.rows_written as usize, None, Arc::new(props), + #[cfg(feature = "encryption")] None, ) .unwrap(), @@ -3744,6 +3747,7 @@ mod tests { result.rows_written as usize, None, Arc::new(props), + #[cfg(feature = "encryption")] None, ) .unwrap(), diff --git a/parquet/src/errors.rs b/parquet/src/errors.rs index 72e5cedcc5a8..4cb1f99c3cf6 100644 --- a/parquet/src/errors.rs +++ b/parquet/src/errors.rs @@ -132,7 +132,7 @@ impl From for ParquetError { } } -//#[cfg(feature = "encryption")] +#[cfg(feature = "encryption")] impl From for ParquetError { fn from(e: ring::error::Unspecified) -> ParquetError { ParquetError::External(Box::new(e)) diff --git a/parquet/src/file/footer.rs b/parquet/src/file/footer.rs index af34fafb2e81..21f909d505b2 100644 --- a/parquet/src/file/footer.rs +++ b/parquet/src/file/footer.rs @@ -17,6 +17,7 @@ //! Module for working with Parquet file footers. 
+#[cfg(feature = "encryption")] use crate::encryption::ciphers::FileDecryptionProperties; use crate::errors::Result; use crate::file::{metadata::*, reader::ChunkReader, FOOTER_SIZE}; @@ -60,9 +61,13 @@ pub fn parse_metadata(chunk_reader: &R) -> Result, + #[cfg(feature = "encryption")] file_decryption_properties: Option<&FileDecryptionProperties>, ) -> Result { - ParquetMetaDataReader::decode_metadata(buf, file_decryption_properties) + ParquetMetaDataReader::decode_metadata( + buf, + #[cfg(feature = "encryption")] + file_decryption_properties, + ) } /// Decodes the Parquet footer returning the metadata length in bytes diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index 2e8c654a267f..6a8e8d8147c6 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -104,6 +104,7 @@ use crate::format::{ }; use crate::basic::{ColumnOrder, Compression, Encoding, Type}; +#[cfg(feature = "encryption")] use crate::encryption::ciphers::FileDecryptor; use crate::errors::{ParquetError, Result}; pub(crate) use crate::file::metadata::memory::HeapSize; @@ -176,6 +177,7 @@ pub struct ParquetMetaData { /// Offset index for each page in each column chunk offset_index: Option, /// Optional file decryptor + #[cfg(feature = "encryption")] file_decryptor: Option, } @@ -185,11 +187,12 @@ impl ParquetMetaData { pub fn new( file_metadata: FileMetaData, row_groups: Vec, - file_decryptor: Option, + #[cfg(feature = "encryption")] file_decryptor: Option, ) -> Self { ParquetMetaData { file_metadata, row_groups, + #[cfg(feature = "encryption")] file_decryptor, column_index: None, offset_index: None, @@ -223,6 +226,7 @@ impl ParquetMetaData { } /// Returns file decryptor as reference. 
+ #[cfg(feature = "encryption")] pub fn file_decryptor(&self) -> &Option { &self.file_decryptor } @@ -338,7 +342,12 @@ pub struct ParquetMetaDataBuilder(ParquetMetaData); impl ParquetMetaDataBuilder { /// Create a new builder from a file metadata, with no row groups pub fn new(file_meta_data: FileMetaData) -> Self { - Self(ParquetMetaData::new(file_meta_data, vec![], None)) + Self(ParquetMetaData::new( + file_meta_data, + vec![], + #[cfg(feature = "encryption")] + None, + )) } /// Create a new builder from an existing ParquetMetaData diff --git a/parquet/src/file/metadata/reader.rs b/parquet/src/file/metadata/reader.rs index 4eacad6dcff6..5684b4e95cfa 100644 --- a/parquet/src/file/metadata/reader.rs +++ b/parquet/src/file/metadata/reader.rs @@ -20,6 +20,7 @@ use std::{io::Read, ops::Range, sync::Arc}; use bytes::Bytes; use crate::basic::ColumnOrder; +#[cfg(feature = "encryption")] use crate::encryption::ciphers::{ create_footer_aad, BlockDecryptor, FileDecryptionProperties, FileDecryptor, }; @@ -29,10 +30,9 @@ use crate::file::page_index::index::Index; use crate::file::page_index::index_reader::{acc_range, decode_column_index, decode_offset_index}; use crate::file::reader::ChunkReader; use crate::file::{FOOTER_SIZE, PARQUET_MAGIC, PARQUET_MAGIC_ENCR_FOOTER}; -use crate::format::{ - ColumnOrder as TColumnOrder, EncryptionAlgorithm, FileCryptoMetaData as TFileCryptoMetaData, - FileMetaData as TFileMetaData, -}; +use crate::format::{ColumnOrder as TColumnOrder, FileMetaData as TFileMetaData}; +#[cfg(feature = "encryption")] +use crate::format::{EncryptionAlgorithm, FileCryptoMetaData as TFileCryptoMetaData}; use crate::schema::types; use crate::schema::types::SchemaDescriptor; use crate::thrift::{TCompactSliceInputProtocol, TSerializable}; @@ -74,6 +74,7 @@ pub struct ParquetMetaDataReader { // Size of the serialized thrift metadata plus the 8 byte footer. Only set if // `self.parse_metadata` is called. 
metadata_size: Option, + #[cfg(feature = "encryption")] file_decryption_properties: Option, } @@ -136,6 +137,7 @@ impl ParquetMetaDataReader { /// Provide the [`FileDecryptionProperties`] to use when decrypting the file. /// /// This is only necessary when the file is encrypted. + #[cfg(feature = "encryption")] pub fn with_encryption_properties( mut self, properties: Option<&FileDecryptionProperties>, @@ -544,6 +546,7 @@ impl ParquetMetaDataReader { let start = file_size - footer_metadata_len as u64; Self::decode_metadata( chunk_reader.get_bytes(start, metadata_len)?.as_ref(), + #[cfg(feature = "encryption")] self.file_decryption_properties.as_ref(), ) } @@ -651,12 +654,18 @@ impl ParquetMetaDataReader { /// [Parquet Spec]: https://github.com/apache/parquet-format#metadata pub fn decode_metadata( buf: &[u8], - file_decryption_properties: Option<&FileDecryptionProperties>, + #[cfg(feature = "encryption")] file_decryption_properties: Option< + &FileDecryptionProperties, + >, ) -> Result { let mut prot = TCompactSliceInputProtocol::new(buf); + + #[cfg(feature = "encryption")] let mut file_decryptor = None; + #[cfg(feature = "encryption")] let decrypted_fmd_buf; + #[cfg(feature = "encryption")] if file_decryption_properties.is_some() && file_decryption_properties.unwrap().has_footer_key() { @@ -721,6 +730,7 @@ impl ParquetMetaDataReader { Ok(ParquetMetaData::new( file_metadata, row_groups, + #[cfg(feature = "encryption")] file_decryptor, )) } diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index 1f1b61e7790c..692b3668dccb 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -22,6 +22,7 @@ use crate::basic::{Encoding, Type}; use crate::bloom_filter::Sbbf; use crate::column::page::{Page, PageMetadata, PageReader}; use crate::compression::{create_codec, Codec}; +#[cfg(feature = "encryption")] use crate::encryption::ciphers::{create_page_aad, BlockDecryptor, CryptoContext, ModuleType}; 
use crate::errors::{ParquetError, Result}; use crate::file::page_index::offset_index::OffsetIndexMetaData; @@ -324,6 +325,7 @@ impl RowGroupReader for SerializedRowGroupReader<'_, R self.metadata.num_rows() as usize, page_locations, props, + #[cfg(feature = "encryption")] None, )?)) } @@ -341,8 +343,9 @@ impl RowGroupReader for SerializedRowGroupReader<'_, R /// Reads a [`PageHeader`] from the provided [`Read`] pub(crate) fn read_page_header( input: &mut T, - crypto_context: Option>, + #[cfg(feature = "encryption")] crypto_context: Option>, ) -> Result { + #[cfg(feature = "encryption")] if let Some(crypto_context) = crypto_context { let decryptor = &crypto_context.data_decryptor(); // todo: get column decryptor @@ -383,7 +386,7 @@ pub(crate) fn read_page_header( /// Reads a [`PageHeader`] from the provided [`Read`] returning the number of bytes read fn read_page_header_len( input: &mut T, - crypto_context: Option>, + #[cfg(feature = "encryption")] crypto_context: Option>, ) -> Result<(usize, PageHeader)> { /// A wrapper around a [`std::io::Read`] that keeps track of the bytes read struct TrackedRead { @@ -403,7 +406,11 @@ fn read_page_header_len( inner: input, bytes_read: 0, }; - let header = read_page_header(&mut tracked, crypto_context)?; + let header = read_page_header( + &mut tracked, + #[cfg(feature = "encryption")] + crypto_context, + )?; Ok((tracked.bytes_read, header)) } @@ -413,7 +420,7 @@ pub(crate) fn decode_page( buffer: Bytes, physical_type: Type, decompressor: Option<&mut Box>, - crypto_context: Option>, + #[cfg(feature = "encryption")] crypto_context: Option>, ) -> Result { // Verify the 32-bit CRC checksum of the page #[cfg(feature = "crc")] @@ -440,6 +447,7 @@ pub(crate) fn decode_page( can_decompress = header_v2.is_compressed.unwrap_or(true); } + #[cfg(feature = "encryption")] let buffer: Bytes = if crypto_context.is_some() { let crypto_context = crypto_context.as_ref().unwrap(); let decryptor = crypto_context.data_decryptor(); @@ -584,6 +592,7 @@ 
pub struct SerializedPageReader { state: SerializedPageReaderState, /// Crypto context + #[cfg(feature = "encryption")] crypto_context: Option>, } @@ -594,7 +603,7 @@ impl SerializedPageReader { meta: &ColumnChunkMetaData, total_rows: usize, page_locations: Option>, - crypto_context: Option>, + #[cfg(feature = "encryption")] crypto_context: Option>, ) -> Result { let props = Arc::new(ReaderProperties::builder().build()); SerializedPageReader::new_with_properties( @@ -603,6 +612,7 @@ impl SerializedPageReader { total_rows, page_locations, props, + #[cfg(feature = "encryption")] crypto_context, ) } @@ -614,7 +624,7 @@ impl SerializedPageReader { total_rows: usize, page_locations: Option>, props: ReaderPropertiesPtr, - crypto_context: Option>, + #[cfg(feature = "encryption")] crypto_context: Option>, ) -> Result { let decompressor = create_codec(meta.compression(), props.codec_options())?; let (start, len) = meta.byte_range(); @@ -644,21 +654,13 @@ impl SerializedPageReader { require_dictionary: meta.dictionary_page_offset().is_some(), }, }; - if crypto_context.is_some() { - return Ok(Self { - reader, - decompressor, - state, - physical_type: meta.column_type(), - crypto_context, - }); - } Ok(Self { reader, decompressor, state, physical_type: meta.column_type(), - crypto_context: None, + #[cfg(feature = "encryption")] + crypto_context, }) } @@ -768,12 +770,17 @@ impl PageReader for SerializedPageReader { let header = if let Some(header) = next_page_header.take() { *header } else { + #[cfg(feature = "encryption")] let crypto_context = page_crypto_context( &self.crypto_context, *page_ordinal, *require_dictionary, )?; - let (header_len, header) = read_page_header_len(&mut read, crypto_context)?; + let (header_len, header) = read_page_header_len( + &mut read, + #[cfg(feature = "encryption")] + crypto_context, + )?; verify_page_header_len(header_len, *remaining)?; *offset += header_len; *remaining -= header_len; @@ -803,6 +810,7 @@ impl PageReader for SerializedPageReader 
{ )); } + #[cfg(feature = "encryption")] let crypto_context = page_crypto_context( &self.crypto_context, *page_ordinal, @@ -813,6 +821,7 @@ impl PageReader for SerializedPageReader { Bytes::from(buffer), self.physical_type, self.decompressor.as_mut(), + #[cfg(feature = "encryption")] crypto_context, )?; if page.is_data_page() { @@ -849,6 +858,7 @@ impl PageReader for SerializedPageReader { bytes, self.physical_type, self.decompressor.as_mut(), + #[cfg(feature = "encryption")] None, )? } @@ -880,7 +890,11 @@ impl PageReader for SerializedPageReader { } } else { let mut read = self.reader.get_read(*offset as u64)?; - let (header_len, header) = read_page_header_len(&mut read, None)?; + let (header_len, header) = read_page_header_len( + &mut read, + #[cfg(feature = "encryption")] + None, + )?; verify_page_header_len(header_len, *remaining_bytes)?; *offset += header_len; *remaining_bytes -= header_len; @@ -943,7 +957,11 @@ impl PageReader for SerializedPageReader { *remaining_bytes -= buffered_header.compressed_page_size as usize; } else { let mut read = self.reader.get_read(*offset as u64)?; - let (header_len, header) = read_page_header_len(&mut read, None)?; + let (header_len, header) = read_page_header_len( + &mut read, + #[cfg(feature = "encryption")] + None, + )?; verify_page_header_len(header_len, *remaining_bytes)?; verify_page_size( header.compressed_page_size, @@ -972,6 +990,7 @@ impl PageReader for SerializedPageReader { } } +#[cfg(feature = "encryption")] fn page_crypto_context( crypto_context: &Option>, page_ordinal: usize, diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index b509707da604..2fa2d2dcf910 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -1314,6 +1314,7 @@ mod tests { total_num_values as usize, None, Arc::new(props), + #[cfg(feature = "encryption")] None, ) .unwrap(); diff --git a/parquet/src/lib.rs b/parquet/src/lib.rs index 8c1c190ea871..6a9f9947195b 100644 --- a/parquet/src/lib.rs +++ 
b/parquet/src/lib.rs @@ -141,7 +141,7 @@ experimental!(mod compression); experimental!(mod encodings); pub mod bloom_filter; -//#[cfg(feature = "encryption")] +#[cfg(feature = "encryption")] experimental!(mod encryption); pub mod file; From 952892d610b5862582c60e908d09a061699b6495 Mon Sep 17 00:00:00 2001 From: Adam Reeve Date: Fri, 20 Dec 2024 15:02:58 +1300 Subject: [PATCH 20/97] Handle when a file is encrypted but encryption is disabled or no decryption properties are provided --- parquet/src/arrow/async_reader/metadata.rs | 20 ++++- parquet/src/arrow/async_reader/mod.rs | 9 +- parquet/src/file/footer.rs | 15 ++-- parquet/src/file/metadata/reader.rs | 95 ++++++++++++++++++---- 4 files changed, 109 insertions(+), 30 deletions(-) diff --git a/parquet/src/arrow/async_reader/metadata.rs b/parquet/src/arrow/async_reader/metadata.rs index 34e14cb5fc50..53f992e424c7 100644 --- a/parquet/src/arrow/async_reader/metadata.rs +++ b/parquet/src/arrow/async_reader/metadata.rs @@ -113,7 +113,8 @@ impl MetadataLoader { let mut footer = [0; FOOTER_SIZE]; footer.copy_from_slice(&suffix[suffix_len - FOOTER_SIZE..suffix_len]); - let length = ParquetMetaDataReader::decode_footer(&footer)?; + let footer = ParquetMetaDataReader::decode_footer_tail(&footer)?; + let length = footer.metadata_length(); if file_size < length + FOOTER_SIZE { return Err(ParquetError::EOF(format!( @@ -127,13 +128,26 @@ impl MetadataLoader { let (metadata, remainder) = if length > suffix_len - FOOTER_SIZE { let metadata_start = file_size - length - FOOTER_SIZE; let meta = fetch.fetch(metadata_start..file_size - FOOTER_SIZE).await?; - (ParquetMetaDataReader::decode_metadata(&meta, None)?, None) + ( + ParquetMetaDataReader::decode_metadata( + &meta, + footer.encrypted_footer(), + #[cfg(feature = "encryption")] + None, + )?, + None, + ) } else { let metadata_start = file_size - length - FOOTER_SIZE - footer_start; let slice = &suffix[metadata_start..suffix_len - FOOTER_SIZE]; ( - 
ParquetMetaDataReader::decode_metadata(slice, None)?, + ParquetMetaDataReader::decode_metadata( + slice, + footer.encrypted_footer(), + #[cfg(feature = "encryption")] + None, + )?, Some((footer_start, suffix.slice(..metadata_start))), ) }; diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index 89dd32c18f4a..c2b5410eba62 100644 --- a/parquet/src/arrow/async_reader/mod.rs +++ b/parquet/src/arrow/async_reader/mod.rs @@ -146,7 +146,8 @@ impl AsyncFileReader for T { let mut buf = [0_u8; FOOTER_SIZE]; self.read_exact(&mut buf).await?; - let metadata_len = ParquetMetaDataReader::decode_footer(&buf)?; + let footer = ParquetMetaDataReader::decode_footer_tail(&buf)?; + let metadata_len = footer.metadata_length(); self.seek(SeekFrom::End(-FOOTER_SIZE_I64 - metadata_len as i64)) .await?; @@ -155,7 +156,10 @@ impl AsyncFileReader for T { // todo: use file_decryption_properties Ok(Arc::new(ParquetMetaDataReader::decode_metadata( - &buf, None, + &buf, + footer.encrypted_footer(), + #[cfg(feature = "encryption")] + None, )?)) } .boxed() @@ -972,6 +976,7 @@ impl RowGroups for InMemoryRowGroup<'_> { self.metadata.column(i), self.row_count, page_locations, + #[cfg(feature = "encryption")] None, )?); diff --git a/parquet/src/file/footer.rs b/parquet/src/file/footer.rs index 21f909d505b2..bdab765cf700 100644 --- a/parquet/src/file/footer.rs +++ b/parquet/src/file/footer.rs @@ -59,14 +59,12 @@ pub fn parse_metadata(chunk_reader: &R) -> Result, -) -> Result { +pub fn decode_metadata(buf: &[u8]) -> Result { ParquetMetaDataReader::decode_metadata( buf, + false, #[cfg(feature = "encryption")] - file_decryption_properties, + None, ) } @@ -81,7 +79,10 @@ pub fn decode_metadata( /// | len | 'PAR1' | /// +-----+--------+ /// ``` -#[deprecated(since = "53.1.0", note = "Use ParquetMetaDataReader::decode_footer")] +#[deprecated( + since = "53.1.0", + note = "Use ParquetMetaDataReader::decode_footer_tail" +)] pub fn decode_footer(slice: &[u8; FOOTER_SIZE]) 
-> Result { - ParquetMetaDataReader::decode_footer(slice) + ParquetMetaDataReader::decode_footer_tail(slice).map(|f| f.metadata_length()) } diff --git a/parquet/src/file/metadata/reader.rs b/parquet/src/file/metadata/reader.rs index 5684b4e95cfa..28c6d565c991 100644 --- a/parquet/src/file/metadata/reader.rs +++ b/parquet/src/file/metadata/reader.rs @@ -78,6 +78,26 @@ pub struct ParquetMetaDataReader { file_decryption_properties: Option, } +/// Describes how the footer metadata is stored +/// +/// This is parsed from the last 8 bytes of the Parquet file +pub struct FooterTail { + metadata_length: usize, + encrypted_footer: bool, +} + +impl FooterTail { + /// The length of the footer metadata in bytes + pub fn metadata_length(&self) -> usize { + self.metadata_length + } + + /// Whether the footer metadata is encrypted + pub fn encrypted_footer(&self) -> bool { + self.encrypted_footer + } +} + impl ParquetMetaDataReader { /// Create a new [`ParquetMetaDataReader`] pub fn new() -> Self { @@ -396,6 +416,7 @@ impl ParquetMetaDataReader { &mut fetch, file_size, self.get_prefetch_size(), + #[cfg(feature = "encryption")] self.file_decryption_properties.as_ref(), ) .await?; @@ -535,7 +556,8 @@ impl ParquetMetaDataReader { .get_read(file_size - 8)? 
.read_exact(&mut footer)?; - let metadata_len = Self::decode_footer(&footer)?; + let footer = Self::decode_footer_tail(&footer)?; + let metadata_len = footer.metadata_length(); let footer_metadata_len = FOOTER_SIZE + metadata_len; self.metadata_size = Some(footer_metadata_len); @@ -546,6 +568,7 @@ impl ParquetMetaDataReader { let start = file_size - footer_metadata_len as u64; Self::decode_metadata( chunk_reader.get_bytes(start, metadata_len)?.as_ref(), + footer.encrypted_footer(), #[cfg(feature = "encryption")] self.file_decryption_properties.as_ref(), ) @@ -569,7 +592,9 @@ impl ParquetMetaDataReader { fetch: &mut F, file_size: usize, prefetch: usize, - file_decryption_properties: Option<&FileDecryptionProperties>, + #[cfg(feature = "encryption")] file_decryption_properties: Option< + &FileDecryptionProperties, + >, ) -> Result<(ParquetMetaData, Option<(usize, Bytes)>)> { if file_size < FOOTER_SIZE { return Err(eof_err!("file size of {} is less than footer", file_size)); @@ -594,7 +619,8 @@ impl ParquetMetaDataReader { let mut footer = [0; FOOTER_SIZE]; footer.copy_from_slice(&suffix[suffix_len - FOOTER_SIZE..suffix_len]); - let length = Self::decode_footer(&footer)?; + let footer = Self::decode_footer_tail(&footer)?; + let length = footer.metadata_length(); if file_size < length + FOOTER_SIZE { return Err(eof_err!( @@ -609,22 +635,32 @@ impl ParquetMetaDataReader { let metadata_start = file_size - length - FOOTER_SIZE; let meta = fetch.fetch(metadata_start..file_size - FOOTER_SIZE).await?; Ok(( - Self::decode_metadata(&meta, file_decryption_properties)?, + Self::decode_metadata( + &meta, + footer.encrypted_footer(), + #[cfg(feature = "encryption")] + file_decryption_properties, + )?, None, )) } else { let metadata_start = file_size - length - FOOTER_SIZE - footer_start; let slice = &suffix[metadata_start..suffix_len - FOOTER_SIZE]; Ok(( - Self::decode_metadata(slice, file_decryption_properties)?, + Self::decode_metadata( + slice, + footer.encrypted_footer(), + 
#[cfg(feature = "encryption")] + file_decryption_properties, + )?, Some((footer_start, suffix.slice(..metadata_start))), )) } } - /// Decodes the Parquet footer returning the metadata length in bytes + /// Decodes the end of the Parquet footer /// - /// A parquet footer is 8 bytes long and has the following layout: + /// There are 8 bytes at the end of the Parquet footer with the following layout: /// * 4 bytes for the metadata length /// * 4 bytes for the magic bytes 'PAR1' or 'PARE' (encrypted footer) /// @@ -633,16 +669,28 @@ impl ParquetMetaDataReader { /// | len | 'PAR1' or 'PARE' | /// +-----+------------------+ /// ``` - pub fn decode_footer(slice: &[u8; FOOTER_SIZE]) -> Result { - // check this is indeed a parquet file - if slice[4..] != PARQUET_MAGIC && slice[4..] != PARQUET_MAGIC_ENCR_FOOTER { + pub fn decode_footer_tail(slice: &[u8; FOOTER_SIZE]) -> Result { + let magic = &slice[4..]; + let encrypted_footer = if magic == PARQUET_MAGIC_ENCR_FOOTER { + true + } else if magic == PARQUET_MAGIC { + false + } else { return Err(general_err!("Invalid Parquet file. Corrupt footer")); - } - + }; // get the metadata length from the footer let metadata_len = u32::from_le_bytes(slice[..4].try_into().unwrap()); - // u32 won't be larger than usize in most cases - Ok(metadata_len as usize) + Ok(FooterTail { + // u32 won't be larger than usize in most cases + metadata_length: metadata_len as usize, + encrypted_footer, + }) + } + + /// Decodes the Parquet footer, returning the metadata length in bytes + #[deprecated(note = "use decode_footer_tail instead")] + pub fn decode_footer(slice: &[u8; FOOTER_SIZE]) -> Result { + Self::decode_footer_tail(slice).map(|f| f.metadata_length) } /// Decodes [`ParquetMetaData`] from the provided bytes. 
@@ -654,21 +702,32 @@ impl ParquetMetaDataReader { /// [Parquet Spec]: https://github.com/apache/parquet-format#metadata pub fn decode_metadata( buf: &[u8], + encrypted_footer: bool, #[cfg(feature = "encryption")] file_decryption_properties: Option< &FileDecryptionProperties, >, ) -> Result { let mut prot = TCompactSliceInputProtocol::new(buf); + #[cfg(not(feature = "encryption"))] + if encrypted_footer { + return Err(general_err!( + "Parquet file has an encrypted footer but the encryption feature is disabled" + )); + } + #[cfg(feature = "encryption")] let mut file_decryptor = None; #[cfg(feature = "encryption")] let decrypted_fmd_buf; #[cfg(feature = "encryption")] - if file_decryption_properties.is_some() - && file_decryption_properties.unwrap().has_footer_key() - { + if encrypted_footer { + if file_decryption_properties.is_none() { + return Err(general_err!("Parquet file has an encrypted footer but no decryption properties were provided")); + }; + let file_decryption_properties = file_decryption_properties.unwrap(); + let t_file_crypto_metadata: TFileCryptoMetaData = TFileCryptoMetaData::read_from_in_protocol(&mut prot) .map_err(|e| general_err!("Could not parse crypto metadata: {}", e))?; @@ -690,7 +749,7 @@ impl ParquetMetaDataReader { let aad_prefix: Vec = aes_gcm_algo.aad_prefix.unwrap_or_default(); file_decryptor = Some(FileDecryptor::new( - file_decryption_properties.unwrap(), + file_decryption_properties, aad_file_unique.clone(), aad_prefix.clone(), )); From 8aa8ba454437c90570dd36d2d2147f1ae11b917b Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Sun, 22 Dec 2024 13:11:28 +0100 Subject: [PATCH 21/97] Allow for plaintext footer --- parquet/Cargo.toml | 2 +- parquet/src/arrow/arrow_reader/mod.rs | 23 +++++++++++--------- parquet/src/encryption/ciphers.rs | 16 +++++++++----- parquet/src/file/metadata/reader.rs | 30 ++++++++++++++++++++------- parquet/src/file/serialized_reader.rs | 7 +++++-- 5 files changed, 53 insertions(+), 25 deletions(-) diff --git 
a/parquet/Cargo.toml b/parquet/Cargo.toml index 895e091b6d2b..3d8d4ed9aab4 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -99,7 +99,7 @@ zstd-sys = { version = ">=2.0.0, <2.0.14", default-features = false } all-features = true [features] -default = ["arrow", "snap", "brotli", "flate2", "lz4", "zstd", "base64", "simdutf8"] +default = ["arrow", "snap", "brotli", "flate2", "lz4", "zstd", "base64", "simdutf8", "encryption"] # Enable lz4 lz4 = ["lz4_flex"] # Enable arrow reader/writer APIs diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 70d38cd8faed..cf0e45d7848b 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -390,7 +390,10 @@ impl ArrowReaderMetadata { ) -> Result { let metadata = ParquetMetaDataReader::new().with_page_indexes(options.page_index); #[cfg(feature = "encryption")] - let metadata = metadata.with_encryption_properties(file_decryption_properties); + let metadata = metadata + .with_encryption_properties(file_decryption_properties) + .parse_and_finish(reader)?; + #[cfg(not(feature = "encryption"))] let metadata = metadata.parse_and_finish(reader)?; Self::try_new(Arc::new(metadata), options) } @@ -1914,7 +1917,7 @@ mod tests { #[cfg(feature = "encryption")] fn test_non_uniform_encryption() { let testdata = arrow::util::test_util::parquet_test_data(); - let path = format!("{testdata}/encrypt_columns_plaintext_footer.parquet.encrypted"); + let path = format!("{testdata}/encrypt_columns_and_footer.parquet.encrypted"); let file = File::open(path).unwrap(); let footer_key = "0123456789012345".as_bytes(); // 128bit/16 @@ -1932,14 +1935,14 @@ mod tests { let metadata = ArrowReaderMetadata::load(&file, Default::default(), decryption_properties.as_ref()) .unwrap(); - // let file_metadata = metadata.metadata.file_metadata(); - // - // assert_eq!(file_metadata.num_rows(), 50); - // assert_eq!(file_metadata.schema_descr().num_columns(), 8); - // assert_eq!( - // 
file_metadata.created_by().unwrap(), - // "parquet-cpp-arrow version 19.0.0-SNAPSHOT" - // ); + let file_metadata = metadata.metadata.file_metadata(); + + assert_eq!(file_metadata.num_rows(), 50); + assert_eq!(file_metadata.schema_descr().num_columns(), 8); + assert_eq!( + file_metadata.created_by().unwrap(), + "parquet-cpp-arrow version 19.0.0-SNAPSHOT" + ); } #[test] diff --git a/parquet/src/encryption/ciphers.rs b/parquet/src/encryption/ciphers.rs index 0b8d9246bca2..5089ab390397 100644 --- a/parquet/src/encryption/ciphers.rs +++ b/parquet/src/encryption/ciphers.rs @@ -23,6 +23,7 @@ use std::sync::Arc; use ring::aead::{Aad, LessSafeKey, NonceSequence, UnboundKey, AES_128_GCM}; use ring::rand::{SecureRandom, SystemRandom}; use crate::errors::{ParquetError, Result}; +use crate::format::EncryptionAlgorithm; pub trait BlockEncryptor { fn encrypt(&mut self, plaintext: &[u8], aad: &[u8]) -> Result>; @@ -268,7 +269,6 @@ impl DecryptionPropertiesBuilder { pub fn with_column_key(mut self, key: Vec, value: Vec) -> Self { let mut column_keys= self.column_keys.unwrap_or_else(HashMap::new); column_keys.insert(key, value); - // let _ = column_keys.insert(key, value); self.column_keys = Some(column_keys); self } @@ -278,7 +278,7 @@ impl DecryptionPropertiesBuilder { pub struct FileDecryptor { decryption_properties: FileDecryptionProperties, // todo decr: change to BlockDecryptor - footer_decryptor: RingGcmBlockDecryptor, + footer_decryptor: Option, aad_file_unique: Vec, aad_prefix: Vec, } @@ -291,9 +291,15 @@ impl PartialEq for FileDecryptor { impl FileDecryptor { pub(crate) fn new(decryption_properties: &FileDecryptionProperties, aad_file_unique: Vec, aad_prefix: Vec) -> Self { + let footer_decryptor = if let Some(footer_key) = decryption_properties.footer_key.clone() { + Some(RingGcmBlockDecryptor::new(footer_key.as_ref())) + } else { + None + }; + Self { // todo decr: if no key available yet (not set in properties, will be retrieved from metadata) - footer_decryptor: 
RingGcmBlockDecryptor::new(decryption_properties.footer_key.clone().unwrap().as_ref()), + footer_decryptor, decryption_properties: decryption_properties.clone(), aad_file_unique, aad_prefix, @@ -302,7 +308,7 @@ impl FileDecryptor { // todo decr: change to BlockDecryptor pub(crate) fn get_footer_decryptor(self) -> RingGcmBlockDecryptor { - self.footer_decryptor + self.footer_decryptor.unwrap() } pub(crate) fn get_column_decryptor(&self, column_key: &[u8]) -> RingGcmBlockDecryptor { @@ -314,7 +320,7 @@ impl FileDecryptor { &self.decryption_properties } - pub(crate) fn footer_decryptor(&self) -> RingGcmBlockDecryptor { + pub(crate) fn footer_decryptor(&self) -> Option { self.footer_decryptor.clone() } diff --git a/parquet/src/file/metadata/reader.rs b/parquet/src/file/metadata/reader.rs index 28c6d565c991..ddf6eb3a397e 100644 --- a/parquet/src/file/metadata/reader.rs +++ b/parquet/src/file/metadata/reader.rs @@ -717,7 +717,7 @@ impl ParquetMetaDataReader { } #[cfg(feature = "encryption")] - let mut file_decryptor = None; + let mut decryptor = None; #[cfg(feature = "encryption")] let decrypted_fmd_buf; @@ -726,7 +726,7 @@ impl ParquetMetaDataReader { if file_decryption_properties.is_none() { return Err(general_err!("Parquet file has an encrypted footer but no decryption properties were provided")); }; - let file_decryption_properties = file_decryption_properties.unwrap(); + let file_decryption_properties = file_decryption_properties; let t_file_crypto_metadata: TFileCryptoMetaData = TFileCryptoMetaData::read_from_in_protocol(&mut prot) @@ -748,14 +748,15 @@ impl ParquetMetaDataReader { let aad_footer = create_footer_aad(aad_file_unique.as_ref())?; let aad_prefix: Vec = aes_gcm_algo.aad_prefix.unwrap_or_default(); - file_decryptor = Some(FileDecryptor::new( - file_decryption_properties, + decryptor = Some(FileDecryptor::new( + file_decryption_properties.unwrap(), aad_file_unique.clone(), aad_prefix.clone(), )); - let decryptor = 
file_decryptor.clone().unwrap().get_footer_decryptor(); + let footer_decryptor = decryptor.clone().unwrap().get_footer_decryptor(); - decrypted_fmd_buf = decryptor.decrypt(prot.as_slice().as_ref(), aad_footer.as_ref())?; + decrypted_fmd_buf = + footer_decryptor.decrypt(prot.as_slice().as_ref(), aad_footer.as_ref())?; prot = TCompactSliceInputProtocol::new(decrypted_fmd_buf.as_ref()); } @@ -773,7 +774,22 @@ impl ParquetMetaDataReader { Self::parse_column_orders(t_file_metadata.column_orders, &schema_descr)?; // todo add file decryptor + #[cfg(feature = "encryption")] if t_file_metadata.encryption_algorithm.is_some() { + let algo = t_file_metadata.encryption_algorithm; + let aes_gcm_algo = if let Some(EncryptionAlgorithm::AESGCMV1(a)) = algo { + a + } else { + unreachable!() + }; // todo decr: add support for GCMCTRV1 + let aad_file_unique = aes_gcm_algo.aad_file_unique.unwrap(); + let aad_prefix: Vec = aes_gcm_algo.aad_prefix.unwrap_or_default(); + let fdp = file_decryption_properties.unwrap(); + decryptor = Some(FileDecryptor::new( + fdp, + aad_file_unique.clone(), + aad_prefix.clone(), + )); // todo get key_metadata etc. 
Set file decryptor in return value // todo check signature } @@ -790,7 +806,7 @@ impl ParquetMetaDataReader { file_metadata, row_groups, #[cfg(feature = "encryption")] - file_decryptor, + decryptor, )) } diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index 692b3668dccb..5cb20b7a9617 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -350,6 +350,9 @@ pub(crate) fn read_page_header( let decryptor = &crypto_context.data_decryptor(); // todo: get column decryptor // let file_decryptor = decryptor.get_column_decryptor(crypto_context.column_ordinal); + // if !decryptor.decryption_properties().has_footer_key() { + // return Err(general_err!("Missing footer decryptor")); + // } let file_decryptor = decryptor.footer_decryptor(); let aad_file_unique = decryptor.aad_file_unique(); @@ -371,7 +374,7 @@ pub(crate) fn read_page_header( let ciphertext_len = u32::from_le_bytes(len_bytes) as usize; let mut ciphertext = vec![0; 4 + ciphertext_len]; input.read_exact(&mut ciphertext[4..])?; - let buf = file_decryptor.decrypt(&ciphertext, aad.as_ref())?; + let buf = file_decryptor.unwrap().decrypt(&ciphertext, aad.as_ref())?; let mut prot = TCompactSliceInputProtocol::new(buf.as_slice()); let page_header = PageHeader::read_from_in_protocol(&mut prot)?; @@ -465,7 +468,7 @@ pub(crate) fn decode_page( crypto_context.column_ordinal, crypto_context.page_ordinal, )?; - let decrypted = file_decryptor.decrypt(&buffer.as_ref(), &aad)?; + let decrypted = file_decryptor.unwrap().decrypt(&buffer.as_ref(), &aad)?; Bytes::from(decrypted) } else { buffer From 490e153ce21532b92e3a25838cc806ce5f820e26 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Mon, 23 Dec 2024 23:36:27 +0100 Subject: [PATCH 22/97] work --- parquet/src/arrow/arrow_reader/mod.rs | 11 ++-- parquet/src/encryption/ciphers.rs | 21 ++++++-- parquet/src/file/metadata/mod.rs | 59 +++++++++++++++++---- parquet/src/file/metadata/reader.rs | 3 +- 
parquet/src/file/serialized_reader.rs | 76 +++++++++++++++------------ 5 files changed, 116 insertions(+), 54 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index cf0e45d7848b..8196b1d7290b 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -1874,8 +1874,8 @@ mod tests { let decryption_properties = Some( ciphers::FileDecryptionProperties::builder() - .with_column_key("kc1".as_bytes().to_vec(), column_1_key.to_vec()) - .with_column_key("kc2".as_bytes().to_vec(), column_2_key.to_vec()) + .with_column_key("float_field".as_bytes().to_vec(), column_1_key.to_vec()) + .with_column_key("double_field".as_bytes().to_vec(), column_2_key.to_vec()) .build(), ); @@ -1916,6 +1916,8 @@ mod tests { #[test] #[cfg(feature = "encryption")] fn test_non_uniform_encryption() { + // Decryption configuration 2: Decrypt using key retriever callback that holds the + // keys of two encrypted columns and the footer key. Supply aad_prefix. 
let testdata = arrow::util::test_util::parquet_test_data(); let path = format!("{testdata}/encrypt_columns_and_footer.parquet.encrypted"); let file = File::open(path).unwrap(); @@ -1927,8 +1929,9 @@ mod tests { let decryption_properties = Some( ciphers::FileDecryptionProperties::builder() .with_footer_key(footer_key.to_vec()) - .with_column_key("kc1".as_bytes().to_vec(), column_1_key.to_vec()) - .with_column_key("kc2".as_bytes().to_vec(), column_2_key.to_vec()) + .with_column_key("float_field".as_bytes().to_vec(), column_1_key.to_vec()) + .with_column_key("double_field".as_bytes().to_vec(), column_2_key.to_vec()) + .with_aad_prefix("tester".as_bytes().to_vec()) .build(), ); diff --git a/parquet/src/encryption/ciphers.rs b/parquet/src/encryption/ciphers.rs index 5089ab390397..08afad684dbd 100644 --- a/parquet/src/encryption/ciphers.rs +++ b/parquet/src/encryption/ciphers.rs @@ -22,6 +22,7 @@ use std::collections::HashMap; use std::sync::Arc; use ring::aead::{Aad, LessSafeKey, NonceSequence, UnboundKey, AES_128_GCM}; use ring::rand::{SecureRandom, SystemRandom}; +use zstd::zstd_safe::WriteBuf; use crate::errors::{ParquetError, Result}; use crate::format::EncryptionAlgorithm; @@ -231,6 +232,7 @@ fn create_module_aad(file_aad: &[u8], module_type: ModuleType, row_group_ordinal pub struct FileDecryptionProperties { footer_key: Option>, column_keys: Option, Vec>>, + aad_prefix: Option>, } impl FileDecryptionProperties { @@ -238,11 +240,16 @@ impl FileDecryptionProperties { DecryptionPropertiesBuilder::with_defaults() } pub fn has_footer_key(&self) -> bool { self.footer_key.is_some() } + + pub fn aad_prefix(&self) -> Option<&Vec> { + self.aad_prefix.as_ref() + } } pub struct DecryptionPropertiesBuilder { footer_key: Option>, column_keys: Option, Vec>>, + aad_prefix: Option>, } impl DecryptionPropertiesBuilder { @@ -250,6 +257,7 @@ impl DecryptionPropertiesBuilder { Self { footer_key: None, column_keys: None, + aad_prefix: None, } } @@ -257,6 +265,7 @@ impl 
DecryptionPropertiesBuilder { FileDecryptionProperties { footer_key: self.footer_key, column_keys: self.column_keys, + aad_prefix: self.aad_prefix, } } @@ -266,6 +275,11 @@ impl DecryptionPropertiesBuilder { self } + pub fn with_aad_prefix(mut self, value: Vec) -> Self { + self.aad_prefix = Some(value); + self + } + pub fn with_column_key(mut self, key: Vec, value: Vec) -> Self { let mut column_keys= self.column_keys.unwrap_or_else(HashMap::new); column_keys.insert(key, value); @@ -311,9 +325,10 @@ impl FileDecryptor { self.footer_decryptor.unwrap() } - pub(crate) fn get_column_decryptor(&self, column_key: &[u8]) -> RingGcmBlockDecryptor { - let column_key = self.decryption_properties.column_keys.as_ref().unwrap().get(column_key).unwrap(); - RingGcmBlockDecryptor::new(column_key) + pub(crate) fn get_column_decryptor(&self, column_name: &[u8]) -> RingGcmBlockDecryptor { + let column_keys = &self.decryption_properties.column_keys.clone().unwrap(); + let column_key = column_keys[&column_name.to_vec()].clone(); + RingGcmBlockDecryptor::new(&column_key) } pub(crate) fn decryption_properties(&self) -> &FileDecryptionProperties { diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index 6a8e8d8147c6..851d86c0a95b 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -97,12 +97,9 @@ mod writer; use std::ops::Range; use std::sync::Arc; - -use crate::format::{ - BoundaryOrder, ColumnChunk, ColumnIndex, ColumnMetaData, OffsetIndex, PageLocation, RowGroup, - SizeStatistics, SortingColumn, -}; - +use zstd::zstd_safe::WriteBuf; +use crate::format::{BoundaryOrder, ColumnChunk, ColumnCryptoMetaData, ColumnIndex, ColumnMetaData, OffsetIndex, PageLocation, RowGroup, SizeStatistics, SortingColumn}; +use crate::encryption::ciphers::{create_footer_aad, create_page_aad, ModuleType}; use crate::basic::{ColumnOrder, Compression, Encoding, Type}; #[cfg(feature = "encryption")] use crate::encryption::ciphers::FileDecryptor; @@ 
-119,6 +116,9 @@ use crate::schema::types::{ pub use reader::ParquetMetaDataReader; pub use writer::ParquetMetaDataWriter; pub(crate) use writer::ThriftMetadataWriter; +use crate::data_type::AsBytes; +use crate::encryption::ciphers::{BlockDecryptor, DecryptionPropertiesBuilder, FileDecryptionProperties}; +use crate::thrift::{TCompactSliceInputProtocol, TSerializable}; /// Page level statistics for each column chunk of each row group. /// @@ -624,7 +624,7 @@ impl RowGroupMetaData { } /// Method to convert from Thrift. - pub fn from_thrift(schema_descr: SchemaDescPtr, mut rg: RowGroup) -> Result { + pub fn from_thrift(schema_descr: SchemaDescPtr, mut rg: RowGroup, #[cfg(feature = "encryption")] decryptor: Option<&FileDecryptor>) -> Result { if schema_descr.num_columns() != rg.columns.len() { return Err(general_err!( "Column count mismatch. Schema has {} columns while Row Group has {}", @@ -635,8 +635,45 @@ impl RowGroupMetaData { let total_byte_size = rg.total_byte_size; let num_rows = rg.num_rows; let mut columns = vec![]; - for (c, d) in rg.columns.drain(0..).zip(schema_descr.columns()) { - let cc = ColumnChunkMetaData::from_thrift(d.clone(), c)?; + for (i, (c, d)) in rg.columns.drain(0..).zip(schema_descr.columns()).enumerate() { + let cc; + #[cfg(feature = "encryption")] + if let Some(ColumnCryptoMetaData::ENCRYPTIONWITHCOLUMNKEY(crypto_metadata)) = c.crypto_metadata.clone() { + if decryptor.is_none() { + cc = ColumnChunkMetaData::from_thrift(d.clone(), c)?; + } else { + let column_name = crypto_metadata.path_in_schema.join("."); + let column_decryptor = decryptor.unwrap().get_column_decryptor(column_name.as_bytes()); + let aad_file_unique = decryptor.unwrap().aad_file_unique(); + let aad_prefix = decryptor.unwrap().decryption_properties().aad_prefix().unwrap(); + let aad = [aad_prefix.clone(), aad_file_unique.clone()].concat(); + // let s = aad.as_slice(); + let column_aad = create_page_aad( + aad.as_slice(), + ModuleType::ColumnMetaData, + rg.ordinal.unwrap() 
as usize, + i as usize, + None, + )?; + + let mut buf = c.encrypted_column_metadata.unwrap(); + // let mut prot = TCompactSliceInputProtocol::new(buf.as_slice().clone()); + // let mut prot = TCompactSliceInputProtocol::new(buf.as_slice()); + // decrypted_fmd_buf = + // footer_decryptor.decrypt(prot.as_slice().as_ref(), aad_footer.as_ref())?; + let mut c2 = column_decryptor.decrypt(buf.as_ref(), column_aad.as_ref())?; + let mut prot = TCompactSliceInputProtocol::new(c2.as_slice()); + let c3 = ColumnChunk::read_from_in_protocol(&mut prot)?; + // let md = ColumnMetaData::from_thrift(c2, d.clone())?; + // c2.meta_data = Some(md); + cc = ColumnChunkMetaData::from_thrift(d.clone(), c3)?; + // } else if let Some(ColumnCryptoMetaData::ENCRYPTIONWITHFOOTERKEY(x)) = c.crypto_metadata { + // todo!() + } + } else { + cc = ColumnChunkMetaData::from_thrift(d.clone(), c)?; + } + columns.push(cc); } let sorting_columns = rg.sorting_columns; @@ -1629,7 +1666,7 @@ mod tests { .unwrap(); let row_group_exp = row_group_meta.to_thrift(); - let row_group_res = RowGroupMetaData::from_thrift(schema_descr, row_group_exp.clone()) + let row_group_res = RowGroupMetaData::from_thrift(schema_descr, row_group_exp.clone(), #[cfg(feature = "encryption")] None) .unwrap() .to_thrift(); @@ -1711,7 +1748,7 @@ mod tests { .unwrap(); let err = - RowGroupMetaData::from_thrift(schema_descr_3cols, row_group_meta_2cols.to_thrift()) + RowGroupMetaData::from_thrift(schema_descr_3cols, row_group_meta_2cols.to_thrift(), #[cfg(feature = "encryption")] None) .unwrap_err() .to_string(); assert_eq!( diff --git a/parquet/src/file/metadata/reader.rs b/parquet/src/file/metadata/reader.rs index ddf6eb3a397e..ecaff48c97b8 100644 --- a/parquet/src/file/metadata/reader.rs +++ b/parquet/src/file/metadata/reader.rs @@ -767,8 +767,7 @@ impl ParquetMetaDataReader { let mut row_groups = Vec::new(); // TODO: row group filtering for rg in t_file_metadata.row_groups { - // rg. 
- row_groups.push(RowGroupMetaData::from_thrift(schema_descr.clone(), rg)?); + row_groups.push(RowGroupMetaData::from_thrift(schema_descr.clone(), rg, #[cfg(feature = "encryption")] decryptor.as_ref())?); } let column_orders = Self::parse_column_orders(t_file_metadata.column_orders, &schema_descr)?; diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index 5cb20b7a9617..69d5655321be 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -42,6 +42,7 @@ use std::collections::VecDeque; use std::iter; use std::{fs::File, io::Read, path::Path, sync::Arc}; use thrift::protocol::TCompactInputProtocol; +use crate::encryption::ciphers::RingGcmBlockDecryptor; impl TryFrom for SerializedFileReader { type Error = ParquetError; @@ -346,40 +347,42 @@ pub(crate) fn read_page_header( #[cfg(feature = "encryption")] crypto_context: Option>, ) -> Result { #[cfg(feature = "encryption")] - if let Some(crypto_context) = crypto_context { - let decryptor = &crypto_context.data_decryptor(); - // todo: get column decryptor - // let file_decryptor = decryptor.get_column_decryptor(crypto_context.column_ordinal); - // if !decryptor.decryption_properties().has_footer_key() { - // return Err(general_err!("Missing footer decryptor")); - // } - let file_decryptor = decryptor.footer_decryptor(); - let aad_file_unique = decryptor.aad_file_unique(); - - let module_type = if crypto_context.dictionary_page { - ModuleType::DictionaryPageHeader - } else { - ModuleType::DataPageHeader - }; - let aad = create_page_aad( - aad_file_unique.as_slice(), - module_type, - crypto_context.row_group_ordinal, - crypto_context.column_ordinal, - crypto_context.page_ordinal, - )?; - - let mut len_bytes = [0; 4]; - input.read_exact(&mut len_bytes)?; - let ciphertext_len = u32::from_le_bytes(len_bytes) as usize; - let mut ciphertext = vec![0; 4 + ciphertext_len]; - input.read_exact(&mut ciphertext[4..])?; - let buf = 
file_decryptor.unwrap().decrypt(&ciphertext, aad.as_ref())?; - - let mut prot = TCompactSliceInputProtocol::new(buf.as_slice()); - let page_header = PageHeader::read_from_in_protocol(&mut prot)?; - return Ok(page_header); - } + // if let Some(crypto_context) = crypto_context { + // // crypto_context.data_decryptor().get_column_decryptor() + // let decryptor = &crypto_context.data_decryptor(); + // // todo: get column decryptor + // // let file_decryptor = decryptor.ge(crypto_context.column_ordinal); + // // if !decryptor.decryption_properties().has_footer_key() { + // // return Err(general_err!("Missing footer decryptor")); + // // } + // let file_decryptor = decryptor.footer_decryptor(); + // let aad_file_unique = decryptor.aad_file_unique(); + // let aad_prefix = decryptor.aad_prefix(); + // + // let module_type = if crypto_context.dictionary_page { + // ModuleType::DictionaryPageHeader + // } else { + // ModuleType::DataPageHeader + // }; + // let aad = create_page_aad( + // [aad_prefix.as_slice(), aad_file_unique.as_slice()].concat().as_slice(), + // module_type, + // crypto_context.row_group_ordinal, + // crypto_context.column_ordinal, + // crypto_context.page_ordinal, + // )?; + // + // let mut len_bytes = [0; 4]; + // input.read_exact(&mut len_bytes)?; + // let ciphertext_len = u32::from_le_bytes(len_bytes) as usize; + // let mut ciphertext = vec![0; 4 + ciphertext_len]; + // input.read_exact(&mut ciphertext[4..])?; + // let buf = file_decryptor.unwrap().decrypt(&ciphertext, aad.as_ref())?; + // + // let mut prot = TCompactSliceInputProtocol::new(buf.as_slice()); + // let page_header = PageHeader::read_from_in_protocol(&mut prot)?; + // return Ok(page_header); + // } let mut prot = TCompactInputProtocol::new(input); let page_header = PageHeader::read_from_in_protocol(&mut prot)?; @@ -454,6 +457,11 @@ pub(crate) fn decode_page( let buffer: Bytes = if crypto_context.is_some() { let crypto_context = crypto_context.as_ref().unwrap(); let decryptor = 
crypto_context.data_decryptor(); + let Some(file_decryptor) = if let Some(f) = decryptor.footer_decryptor().clone() { + // Some(RingGcmBlockDecryptor::new(decryptor..as_ref())) + } else { + decryptor. + }; let file_decryptor = decryptor.footer_decryptor(); let module_type = if crypto_context.dictionary_page { From 6763ee96d1ffbfa9fa58d46d512649fe5934bd24 Mon Sep 17 00:00:00 2001 From: Adam Reeve Date: Mon, 23 Dec 2024 09:31:42 +1300 Subject: [PATCH 23/97] Fix method name --- parquet/src/arrow/arrow_reader/mod.rs | 2 +- parquet/src/file/metadata/reader.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 8196b1d7290b..6ca5fa7e3dae 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -391,7 +391,7 @@ impl ArrowReaderMetadata { let metadata = ParquetMetaDataReader::new().with_page_indexes(options.page_index); #[cfg(feature = "encryption")] let metadata = metadata - .with_encryption_properties(file_decryption_properties) + .with_decryption_properties(file_decryption_properties) .parse_and_finish(reader)?; #[cfg(not(feature = "encryption"))] let metadata = metadata.parse_and_finish(reader)?; diff --git a/parquet/src/file/metadata/reader.rs b/parquet/src/file/metadata/reader.rs index ecaff48c97b8..afaad108e694 100644 --- a/parquet/src/file/metadata/reader.rs +++ b/parquet/src/file/metadata/reader.rs @@ -158,7 +158,7 @@ impl ParquetMetaDataReader { /// /// This is only necessary when the file is encrypted. 
#[cfg(feature = "encryption")] - pub fn with_encryption_properties( + pub fn with_decryption_properties( mut self, properties: Option<&FileDecryptionProperties>, ) -> Self { From af6c589418f20d352cc2445c27b41c50653b2777 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Sat, 4 Jan 2025 12:01:31 +0100 Subject: [PATCH 24/97] work --- parquet/src/arrow/arrow_reader/mod.rs | 22 +++-- parquet/src/encryption/ciphers.rs | 23 ++++- parquet/src/file/metadata/mod.rs | 76 +++++++++++----- parquet/src/file/metadata/reader.rs | 7 +- parquet/src/file/serialized_reader.rs | 125 ++++++++++++++------------ 5 files changed, 164 insertions(+), 89 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 6ca5fa7e3dae..a2124b1fada2 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -32,6 +32,7 @@ use crate::arrow::array_reader::{build_array_reader, ArrayReader}; use crate::arrow::schema::{parquet_to_arrow_schema_and_fields, ParquetField}; use crate::arrow::{parquet_to_arrow_field_levels, FieldLevels, ProjectionMask}; use crate::column::page::{PageIterator, PageReader}; +use crate::data_type::AsBytes; use crate::errors::{ParquetError, Result}; use crate::file::metadata::{ParquetMetaData, ParquetMetaDataReader}; use crate::file::reader::{ChunkReader, SerializedPageReader}; @@ -41,6 +42,7 @@ mod filter; mod selection; pub mod statistics; +use crate::encryption::ciphers::FileDecryptor; #[cfg(feature = "encryption")] use crate::encryption::ciphers::{CryptoContext, FileDecryptionProperties}; @@ -709,13 +711,17 @@ impl Iterator for ReaderPageIterator { #[cfg(feature = "encryption")] let crypto_context = if self.metadata.file_decryptor().is_some() { let file_decryptor = Arc::new(self.metadata.file_decryptor().clone().unwrap()); - - let crypto_context = CryptoContext::new( - rg_idx, - self.column_idx, - file_decryptor.clone(), - file_decryptor, - ); + let metadata_decryptor = 
Arc::new(self.metadata.file_decryptor().clone().unwrap()); + let column_name = self + .metadata + .file_metadata() + .schema_descr() + .column(self.column_idx); + let data_decryptor = + Arc::new(file_decryptor.get_column_decryptor(column_name.name().as_bytes())); + + let crypto_context = + CryptoContext::new(rg_idx, self.column_idx, metadata_decryptor, data_decryptor); Some(Arc::new(crypto_context)) } else { None @@ -1888,7 +1894,7 @@ mod tests { assert_eq!(file_metadata.schema_descr().num_columns(), 8); assert_eq!( file_metadata.created_by().unwrap(), - "parquet-cpp-arrow version 14.0.0-SNAPSHOT" + "parquet-cpp-arrow version 19.0.0-SNAPSHOT" ); metadata.metadata.row_groups().iter().for_each(|rg| { diff --git a/parquet/src/encryption/ciphers.rs b/parquet/src/encryption/ciphers.rs index 08afad684dbd..4808bfa59c65 100644 --- a/parquet/src/encryption/ciphers.rs +++ b/parquet/src/encryption/ciphers.rs @@ -325,10 +325,23 @@ impl FileDecryptor { self.footer_decryptor.unwrap() } - pub(crate) fn get_column_decryptor(&self, column_name: &[u8]) -> RingGcmBlockDecryptor { + pub(crate) fn get_column_decryptor(&self, column_name: &[u8]) -> FileDecryptor { + if self.decryption_properties.column_keys.is_none() { + return self.clone(); + } let column_keys = &self.decryption_properties.column_keys.clone().unwrap(); - let column_key = column_keys[&column_name.to_vec()].clone(); - RingGcmBlockDecryptor::new(&column_key) + let decryptor = if let Some(column_key) = column_keys.get(column_name) { + Some(RingGcmBlockDecryptor::new(&column_key)) + } else { + None + }; + + FileDecryptor { + decryption_properties: self.decryption_properties.clone(), + footer_decryptor: decryptor, + aad_file_unique: self.aad_file_unique.clone(), + aad_prefix: self.aad_prefix.clone(), + } } pub(crate) fn decryption_properties(&self) -> &FileDecryptionProperties { @@ -346,6 +359,10 @@ impl FileDecryptor { pub(crate) fn aad_prefix(&self) -> &Vec { &self.aad_prefix } + + pub(crate) fn has_footer_key(&self) -> 
bool { + self.decryption_properties.has_footer_key() + } } #[derive(Debug, Clone)] diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index 851d86c0a95b..11771cddb31b 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -95,30 +95,35 @@ mod memory; pub(crate) mod reader; mod writer; -use std::ops::Range; -use std::sync::Arc; -use zstd::zstd_safe::WriteBuf; -use crate::format::{BoundaryOrder, ColumnChunk, ColumnCryptoMetaData, ColumnIndex, ColumnMetaData, OffsetIndex, PageLocation, RowGroup, SizeStatistics, SortingColumn}; -use crate::encryption::ciphers::{create_footer_aad, create_page_aad, ModuleType}; use crate::basic::{ColumnOrder, Compression, Encoding, Type}; +use crate::data_type::AsBytes; #[cfg(feature = "encryption")] use crate::encryption::ciphers::FileDecryptor; +use crate::encryption::ciphers::{create_footer_aad, create_page_aad, ModuleType}; +use crate::encryption::ciphers::{ + BlockDecryptor, DecryptionPropertiesBuilder, FileDecryptionProperties, +}; use crate::errors::{ParquetError, Result}; pub(crate) use crate::file::metadata::memory::HeapSize; use crate::file::page_encoding_stats::{self, PageEncodingStats}; use crate::file::page_index::index::Index; use crate::file::page_index::offset_index::OffsetIndexMetaData; use crate::file::statistics::{self, Statistics}; +use crate::format::{ + BoundaryOrder, ColumnChunk, ColumnCryptoMetaData, ColumnIndex, ColumnMetaData, OffsetIndex, + PageLocation, RowGroup, SizeStatistics, SortingColumn, +}; use crate::schema::types::{ ColumnDescPtr, ColumnDescriptor, ColumnPath, SchemaDescPtr, SchemaDescriptor, Type as SchemaType, }; +use crate::thrift::{TCompactSliceInputProtocol, TSerializable}; pub use reader::ParquetMetaDataReader; +use std::ops::Range; +use std::sync::Arc; pub use writer::ParquetMetaDataWriter; pub(crate) use writer::ThriftMetadataWriter; -use crate::data_type::AsBytes; -use crate::encryption::ciphers::{BlockDecryptor, 
DecryptionPropertiesBuilder, FileDecryptionProperties}; -use crate::thrift::{TCompactSliceInputProtocol, TSerializable}; +use zstd::zstd_safe::WriteBuf; /// Page level statistics for each column chunk of each row group. /// @@ -624,7 +629,11 @@ impl RowGroupMetaData { } /// Method to convert from Thrift. - pub fn from_thrift(schema_descr: SchemaDescPtr, mut rg: RowGroup, #[cfg(feature = "encryption")] decryptor: Option<&FileDecryptor>) -> Result { + pub fn from_thrift( + schema_descr: SchemaDescPtr, + mut rg: RowGroup, + #[cfg(feature = "encryption")] decryptor: Option<&FileDecryptor>, + ) -> Result { if schema_descr.num_columns() != rg.columns.len() { return Err(general_err!( "Column count mismatch. Schema has {} columns while Row Group has {}", @@ -635,17 +644,30 @@ impl RowGroupMetaData { let total_byte_size = rg.total_byte_size; let num_rows = rg.num_rows; let mut columns = vec![]; - for (i, (c, d)) in rg.columns.drain(0..).zip(schema_descr.columns()).enumerate() { + for (i, (c, d)) in rg + .columns + .drain(0..) 
+ .zip(schema_descr.columns()) + .enumerate() + { let cc; #[cfg(feature = "encryption")] - if let Some(ColumnCryptoMetaData::ENCRYPTIONWITHCOLUMNKEY(crypto_metadata)) = c.crypto_metadata.clone() { + if let Some(ColumnCryptoMetaData::ENCRYPTIONWITHCOLUMNKEY(crypto_metadata)) = + c.crypto_metadata.clone() + { if decryptor.is_none() { cc = ColumnChunkMetaData::from_thrift(d.clone(), c)?; } else { let column_name = crypto_metadata.path_in_schema.join("."); - let column_decryptor = decryptor.unwrap().get_column_decryptor(column_name.as_bytes()); + let column_decryptor = decryptor + .unwrap() + .get_column_decryptor(column_name.as_bytes()); let aad_file_unique = decryptor.unwrap().aad_file_unique(); - let aad_prefix = decryptor.unwrap().decryption_properties().aad_prefix().unwrap(); + let aad_prefix = decryptor + .unwrap() + .decryption_properties() + .aad_prefix() + .unwrap(); let aad = [aad_prefix.clone(), aad_file_unique.clone()].concat(); // let s = aad.as_slice(); let column_aad = create_page_aad( @@ -661,7 +683,10 @@ impl RowGroupMetaData { // let mut prot = TCompactSliceInputProtocol::new(buf.as_slice()); // decrypted_fmd_buf = // footer_decryptor.decrypt(prot.as_slice().as_ref(), aad_footer.as_ref())?; - let mut c2 = column_decryptor.decrypt(buf.as_ref(), column_aad.as_ref())?; + let mut c2 = column_decryptor + .footer_decryptor() + .unwrap() + .decrypt(buf.as_ref(), column_aad.as_ref())?; let mut prot = TCompactSliceInputProtocol::new(c2.as_slice()); let c3 = ColumnChunk::read_from_in_protocol(&mut prot)?; // let md = ColumnMetaData::from_thrift(c2, d.clone())?; @@ -1666,9 +1691,14 @@ mod tests { .unwrap(); let row_group_exp = row_group_meta.to_thrift(); - let row_group_res = RowGroupMetaData::from_thrift(schema_descr, row_group_exp.clone(), #[cfg(feature = "encryption")] None) - .unwrap() - .to_thrift(); + let row_group_res = RowGroupMetaData::from_thrift( + schema_descr, + row_group_exp.clone(), + #[cfg(feature = "encryption")] + None, + ) + .unwrap() + 
.to_thrift(); assert_eq!(row_group_res, row_group_exp); } @@ -1747,10 +1777,14 @@ mod tests { .build() .unwrap(); - let err = - RowGroupMetaData::from_thrift(schema_descr_3cols, row_group_meta_2cols.to_thrift(), #[cfg(feature = "encryption")] None) - .unwrap_err() - .to_string(); + let err = RowGroupMetaData::from_thrift( + schema_descr_3cols, + row_group_meta_2cols.to_thrift(), + #[cfg(feature = "encryption")] + None, + ) + .unwrap_err() + .to_string(); assert_eq!( err, "Parquet error: Column count mismatch. Schema has 3 columns while Row Group has 2" diff --git a/parquet/src/file/metadata/reader.rs b/parquet/src/file/metadata/reader.rs index afaad108e694..a07f04509007 100644 --- a/parquet/src/file/metadata/reader.rs +++ b/parquet/src/file/metadata/reader.rs @@ -767,7 +767,12 @@ impl ParquetMetaDataReader { let mut row_groups = Vec::new(); // TODO: row group filtering for rg in t_file_metadata.row_groups { - row_groups.push(RowGroupMetaData::from_thrift(schema_descr.clone(), rg, #[cfg(feature = "encryption")] decryptor.as_ref())?); + row_groups.push(RowGroupMetaData::from_thrift( + schema_descr.clone(), + rg, + #[cfg(feature = "encryption")] + decryptor.as_ref(), + )?); } let column_orders = Self::parse_column_orders(t_file_metadata.column_orders, &schema_descr)?; diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index 69d5655321be..a3ca6efce6cd 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -22,6 +22,7 @@ use crate::basic::{Encoding, Type}; use crate::bloom_filter::Sbbf; use crate::column::page::{Page, PageMetadata, PageReader}; use crate::compression::{create_codec, Codec}; +use crate::encryption::ciphers::RingGcmBlockDecryptor; #[cfg(feature = "encryption")] use crate::encryption::ciphers::{create_page_aad, BlockDecryptor, CryptoContext, ModuleType}; use crate::errors::{ParquetError, Result}; @@ -42,7 +43,6 @@ use std::collections::VecDeque; use std::iter; use 
std::{fs::File, io::Read, path::Path, sync::Arc}; use thrift::protocol::TCompactInputProtocol; -use crate::encryption::ciphers::RingGcmBlockDecryptor; impl TryFrom for SerializedFileReader { type Error = ParquetError; @@ -347,42 +347,44 @@ pub(crate) fn read_page_header( #[cfg(feature = "encryption")] crypto_context: Option>, ) -> Result { #[cfg(feature = "encryption")] - // if let Some(crypto_context) = crypto_context { - // // crypto_context.data_decryptor().get_column_decryptor() - // let decryptor = &crypto_context.data_decryptor(); - // // todo: get column decryptor - // // let file_decryptor = decryptor.ge(crypto_context.column_ordinal); - // // if !decryptor.decryption_properties().has_footer_key() { - // // return Err(general_err!("Missing footer decryptor")); - // // } - // let file_decryptor = decryptor.footer_decryptor(); - // let aad_file_unique = decryptor.aad_file_unique(); - // let aad_prefix = decryptor.aad_prefix(); - // - // let module_type = if crypto_context.dictionary_page { - // ModuleType::DictionaryPageHeader - // } else { - // ModuleType::DataPageHeader - // }; - // let aad = create_page_aad( - // [aad_prefix.as_slice(), aad_file_unique.as_slice()].concat().as_slice(), - // module_type, - // crypto_context.row_group_ordinal, - // crypto_context.column_ordinal, - // crypto_context.page_ordinal, - // )?; - // - // let mut len_bytes = [0; 4]; - // input.read_exact(&mut len_bytes)?; - // let ciphertext_len = u32::from_le_bytes(len_bytes) as usize; - // let mut ciphertext = vec![0; 4 + ciphertext_len]; - // input.read_exact(&mut ciphertext[4..])?; - // let buf = file_decryptor.unwrap().decrypt(&ciphertext, aad.as_ref())?; - // - // let mut prot = TCompactSliceInputProtocol::new(buf.as_slice()); - // let page_header = PageHeader::read_from_in_protocol(&mut prot)?; - // return Ok(page_header); - // } + if let Some(crypto_context) = crypto_context { + // crypto_context.data_decryptor().get_column_decryptor() + let decryptor = 
&crypto_context.data_decryptor(); + // todo: get column decryptor + // let file_decryptor = decryptor.ge(crypto_context.column_ordinal); + // if !decryptor.decryption_properties().has_footer_key() { + // return Err(general_err!("Missing footer decryptor")); + // } + let file_decryptor = decryptor.footer_decryptor(); + let aad_file_unique = decryptor.aad_file_unique(); + let aad_prefix = decryptor.aad_prefix(); + + let module_type = if crypto_context.dictionary_page { + ModuleType::DictionaryPageHeader + } else { + ModuleType::DataPageHeader + }; + let aad = create_page_aad( + [aad_prefix.as_slice(), aad_file_unique.as_slice()] + .concat() + .as_slice(), + module_type, + crypto_context.row_group_ordinal, + crypto_context.column_ordinal, + crypto_context.page_ordinal, + )?; + + let mut len_bytes = [0; 4]; + input.read_exact(&mut len_bytes)?; + let ciphertext_len = u32::from_le_bytes(len_bytes) as usize; + let mut ciphertext = vec![0; 4 + ciphertext_len]; + input.read_exact(&mut ciphertext[4..])?; + let buf = file_decryptor.unwrap().decrypt(&ciphertext, aad.as_ref())?; + + let mut prot = TCompactSliceInputProtocol::new(buf.as_slice()); + let page_header = PageHeader::read_from_in_protocol(&mut prot)?; + return Ok(page_header); + } let mut prot = TCompactInputProtocol::new(input); let page_header = PageHeader::read_from_in_protocol(&mut prot)?; @@ -457,27 +459,38 @@ pub(crate) fn decode_page( let buffer: Bytes = if crypto_context.is_some() { let crypto_context = crypto_context.as_ref().unwrap(); let decryptor = crypto_context.data_decryptor(); - let Some(file_decryptor) = if let Some(f) = decryptor.footer_decryptor().clone() { - // Some(RingGcmBlockDecryptor::new(decryptor..as_ref())) - } else { - decryptor. 
- }; + // let footer_decryptor + // let file_decryptor = if decryptor.has_footer_key() { + // decryptor.footer_decryptor() + // } else { + // // CryptoMetaData::from_thrift(&crypto_context.meta_data) + // // .and_then(|meta| meta.get_page_decryptor(crypto_context.page_ordinal)) + // // .ok_or_else(|| general_err!("Missing footer decryptor"))? + // // page_header.data_page_header + // // decryptor.get_column_decryptor(crypto_context.column_ordinal) + // // decryptor.get_column_decryptor(crypto_context.column_ordinal) + // return Err(general_err!("Missing footer decryptor")); + // // TODO: decryptor should have keys for columns + // }; let file_decryptor = decryptor.footer_decryptor(); - - let module_type = if crypto_context.dictionary_page { - ModuleType::DictionaryPage + if file_decryptor.is_none() { + buffer } else { - ModuleType::DataPage - }; - let aad = create_page_aad( - decryptor.aad_file_unique().as_slice(), - module_type, - crypto_context.row_group_ordinal, - crypto_context.column_ordinal, - crypto_context.page_ordinal, - )?; - let decrypted = file_decryptor.unwrap().decrypt(&buffer.as_ref(), &aad)?; - Bytes::from(decrypted) + let module_type = if crypto_context.dictionary_page { + ModuleType::DictionaryPage + } else { + ModuleType::DataPage + }; + let aad = create_page_aad( + decryptor.aad_file_unique().as_slice(), + module_type, + crypto_context.row_group_ordinal, + crypto_context.column_ordinal, + crypto_context.page_ordinal, + )?; + let decrypted = file_decryptor.unwrap().decrypt(&buffer.as_ref(), &aad)?; + Bytes::from(decrypted) + } } else { buffer }; From 9cf130d45c4ec4a8115c879ba40ff52619028793 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Mon, 6 Jan 2025 20:26:35 +0100 Subject: [PATCH 25/97] Minor --- parquet/src/file/metadata/mod.rs | 28 ++++++++++----------------- parquet/src/file/serialized_reader.rs | 7 ++----- 2 files changed, 12 insertions(+), 23 deletions(-) diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs 
index 11771cddb31b..56109543c3ae 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -662,38 +662,30 @@ impl RowGroupMetaData { let column_decryptor = decryptor .unwrap() .get_column_decryptor(column_name.as_bytes()); + let file_decryptor = column_decryptor.footer_decryptor().unwrap(); + let aad_file_unique = decryptor.unwrap().aad_file_unique(); let aad_prefix = decryptor .unwrap() .decryption_properties() .aad_prefix() .unwrap(); - let aad = [aad_prefix.clone(), aad_file_unique.clone()].concat(); - // let s = aad.as_slice(); + let aad: Vec = [aad_prefix.clone(), aad_file_unique.clone()].concat(); let column_aad = create_page_aad( aad.as_slice(), ModuleType::ColumnMetaData, rg.ordinal.unwrap() as usize, - i as usize, + i, None, )?; let mut buf = c.encrypted_column_metadata.unwrap(); - // let mut prot = TCompactSliceInputProtocol::new(buf.as_slice().clone()); - // let mut prot = TCompactSliceInputProtocol::new(buf.as_slice()); - // decrypted_fmd_buf = - // footer_decryptor.decrypt(prot.as_slice().as_ref(), aad_footer.as_ref())?; - let mut c2 = column_decryptor - .footer_decryptor() - .unwrap() - .decrypt(buf.as_ref(), column_aad.as_ref())?; - let mut prot = TCompactSliceInputProtocol::new(c2.as_slice()); - let c3 = ColumnChunk::read_from_in_protocol(&mut prot)?; - // let md = ColumnMetaData::from_thrift(c2, d.clone())?; - // c2.meta_data = Some(md); - cc = ColumnChunkMetaData::from_thrift(d.clone(), c3)?; - // } else if let Some(ColumnCryptoMetaData::ENCRYPTIONWITHFOOTERKEY(x)) = c.crypto_metadata { - // todo!() + let mut decrypted_cc_buf = + file_decryptor.decrypt(buf.as_slice().as_ref(), column_aad.as_ref())?; + + let mut prot = TCompactSliceInputProtocol::new(decrypted_cc_buf.as_slice()); + let c = ColumnChunk::read_from_in_protocol(&mut prot)?; + cc = ColumnChunkMetaData::from_thrift(d.clone(), c)?; } } else { cc = ColumnChunkMetaData::from_thrift(d.clone(), c)?; diff --git a/parquet/src/file/serialized_reader.rs 
b/parquet/src/file/serialized_reader.rs index a3ca6efce6cd..c7051db10277 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -463,14 +463,11 @@ pub(crate) fn decode_page( // let file_decryptor = if decryptor.has_footer_key() { // decryptor.footer_decryptor() // } else { + // todo + // // decryptor.get_column_decryptor(column_name) // // CryptoMetaData::from_thrift(&crypto_context.meta_data) // // .and_then(|meta| meta.get_page_decryptor(crypto_context.page_ordinal)) // // .ok_or_else(|| general_err!("Missing footer decryptor"))? - // // page_header.data_page_header - // // decryptor.get_column_decryptor(crypto_context.column_ordinal) - // // decryptor.get_column_decryptor(crypto_context.column_ordinal) - // return Err(general_err!("Missing footer decryptor")); - // // TODO: decryptor should have keys for columns // }; let file_decryptor = decryptor.footer_decryptor(); if file_decryptor.is_none() { From 6014acd17a669ee743f6ffde408d071cbc34059f Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 8 Jan 2025 00:39:49 +0100 Subject: [PATCH 26/97] work --- parquet/src/encryption/ciphers.rs | 6 ++++- parquet/src/file/metadata/mod.rs | 31 ++++++++++++---------- parquet/src/file/serialized_reader.rs | 37 +++++++++++++++------------ 3 files changed, 43 insertions(+), 31 deletions(-) diff --git a/parquet/src/encryption/ciphers.rs b/parquet/src/encryption/ciphers.rs index 4808bfa59c65..52680169eb55 100644 --- a/parquet/src/encryption/ciphers.rs +++ b/parquet/src/encryption/ciphers.rs @@ -180,7 +180,7 @@ pub fn create_page_aad(file_aad: &[u8], module_type: ModuleType, row_group_ordin create_module_aad(file_aad, module_type, row_group_ordinal, column_ordinal, page_ordinal) } -fn create_module_aad(file_aad: &[u8], module_type: ModuleType, row_group_ordinal: usize, +pub fn create_module_aad(file_aad: &[u8], module_type: ModuleType, row_group_ordinal: usize, column_ordinal: usize, page_ordinal: Option) -> Result> { let module_buf = 
[module_type as u8]; @@ -325,6 +325,10 @@ impl FileDecryptor { self.footer_decryptor.unwrap() } + pub(crate) fn column_decryptor(&self) -> RingGcmBlockDecryptor { + RingGcmBlockDecryptor::new(self.decryption_properties.footer_key.as_ref().unwrap()) + } + pub(crate) fn get_column_decryptor(&self, column_name: &[u8]) -> FileDecryptor { if self.decryption_properties.column_keys.is_none() { return self.clone(); diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index 56109543c3ae..8ac9a096f36c 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -99,7 +99,9 @@ use crate::basic::{ColumnOrder, Compression, Encoding, Type}; use crate::data_type::AsBytes; #[cfg(feature = "encryption")] use crate::encryption::ciphers::FileDecryptor; -use crate::encryption::ciphers::{create_footer_aad, create_page_aad, ModuleType}; +use crate::encryption::ciphers::{ + create_footer_aad, create_module_aad, create_page_aad, ModuleType, +}; use crate::encryption::ciphers::{ BlockDecryptor, DecryptionPropertiesBuilder, FileDecryptionProperties, }; @@ -659,29 +661,32 @@ impl RowGroupMetaData { cc = ColumnChunkMetaData::from_thrift(d.clone(), c)?; } else { let column_name = crypto_metadata.path_in_schema.join("."); - let column_decryptor = decryptor - .unwrap() - .get_column_decryptor(column_name.as_bytes()); - let file_decryptor = column_decryptor.footer_decryptor().unwrap(); - let aad_file_unique = decryptor.unwrap().aad_file_unique(); let aad_prefix = decryptor .unwrap() .decryption_properties() .aad_prefix() .unwrap(); - let aad: Vec = [aad_prefix.clone(), aad_file_unique.clone()].concat(); - let column_aad = create_page_aad( - aad.as_slice(), + + let column_decryptor = decryptor + .unwrap() + .get_column_decryptor(column_name.as_bytes()) + .footer_decryptor() + .unwrap(); + + let column_aad = create_module_aad( + [aad_prefix.as_slice(), aad_file_unique.as_slice()] + .concat() + .as_slice(), ModuleType::ColumnMetaData, 
rg.ordinal.unwrap() as usize, - i, + i as usize, None, )?; - let mut buf = c.encrypted_column_metadata.unwrap(); - let mut decrypted_cc_buf = - file_decryptor.decrypt(buf.as_slice().as_ref(), column_aad.as_ref())?; + let buf = c.encrypted_column_metadata.unwrap(); + let decrypted_cc_buf = + column_decryptor.decrypt(buf.as_slice().as_ref(), column_aad.as_ref())?; let mut prot = TCompactSliceInputProtocol::new(decrypted_cc_buf.as_slice()); let c = ColumnChunk::read_from_in_protocol(&mut prot)?; diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index c7051db10277..67e8c297efd3 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -22,9 +22,8 @@ use crate::basic::{Encoding, Type}; use crate::bloom_filter::Sbbf; use crate::column::page::{Page, PageMetadata, PageReader}; use crate::compression::{create_codec, Codec}; -use crate::encryption::ciphers::RingGcmBlockDecryptor; #[cfg(feature = "encryption")] -use crate::encryption::ciphers::{create_page_aad, BlockDecryptor, CryptoContext, ModuleType}; +use crate::encryption::ciphers::{create_module_aad, BlockDecryptor, CryptoContext, ModuleType}; use crate::errors::{ParquetError, Result}; use crate::file::page_index::offset_index::OffsetIndexMetaData; use crate::file::{ @@ -346,16 +345,13 @@ pub(crate) fn read_page_header( input: &mut T, #[cfg(feature = "encryption")] crypto_context: Option>, ) -> Result { + // todo: if column is not encrypted skip decryption + #[cfg(feature = "encryption")] if let Some(crypto_context) = crypto_context { - // crypto_context.data_decryptor().get_column_decryptor() let decryptor = &crypto_context.data_decryptor(); - // todo: get column decryptor - // let file_decryptor = decryptor.ge(crypto_context.column_ordinal); - // if !decryptor.decryption_properties().has_footer_key() { - // return Err(general_err!("Missing footer decryptor")); - // } - let file_decryptor = decryptor.footer_decryptor(); + + let 
file_decryptor = decryptor.column_decryptor(); let aad_file_unique = decryptor.aad_file_unique(); let aad_prefix = decryptor.aad_prefix(); @@ -364,7 +360,7 @@ pub(crate) fn read_page_header( } else { ModuleType::DataPageHeader }; - let aad = create_page_aad( + let aad = create_module_aad( [aad_prefix.as_slice(), aad_file_unique.as_slice()] .concat() .as_slice(), @@ -374,12 +370,19 @@ pub(crate) fn read_page_header( crypto_context.page_ordinal, )?; - let mut len_bytes = [0; 4]; - input.read_exact(&mut len_bytes)?; - let ciphertext_len = u32::from_le_bytes(len_bytes) as usize; - let mut ciphertext = vec![0; 4 + ciphertext_len]; - input.read_exact(&mut ciphertext[4..])?; - let buf = file_decryptor.unwrap().decrypt(&ciphertext, aad.as_ref())?; + // let mut len_bytes = [0; 4]; + // input.read_exact(&mut len_bytes)?; + // let ciphertext_len = u32::from_le_bytes(len_bytes) as usize; + // let mut ciphertext = vec![0; 4 + ciphertext_len]; + // input.read_exact(&mut ciphertext[4..])?; + // let mut ciphertext = Vec::new(); + // input.read_to_end(&mut ciphertext)?; + + let mut ciphertext: Vec = vec![]; + input.read_to_end(&mut ciphertext)?; + + // let ciphertext = input.read_to_end(); + let buf = file_decryptor.decrypt(&ciphertext, aad.as_ref())?; let mut prot = TCompactSliceInputProtocol::new(buf.as_slice()); let page_header = PageHeader::read_from_in_protocol(&mut prot)?; @@ -478,7 +481,7 @@ pub(crate) fn decode_page( } else { ModuleType::DataPage }; - let aad = create_page_aad( + let aad = create_module_aad( decryptor.aad_file_unique().as_slice(), module_type, crypto_context.row_group_ordinal, From 2f09a88d1dacd06f1102e75ab148b6d38868b282 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 9 Jan 2025 22:59:19 +0100 Subject: [PATCH 27/97] work --- parquet/src/arrow/arrow_reader/mod.rs | 10 +++++----- parquet/src/encryption/ciphers.rs | 26 ++++++++++++++++---------- parquet/src/file/metadata/mod.rs | 4 ++++ parquet/src/file/serialized_reader.rs | 20 +++++++++----------- 4 
files changed, 34 insertions(+), 26 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index a2124b1fada2..d5133e243e36 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -710,18 +710,18 @@ impl Iterator for ReaderPageIterator { #[cfg(feature = "encryption")] let crypto_context = if self.metadata.file_decryptor().is_some() { - let file_decryptor = Arc::new(self.metadata.file_decryptor().clone().unwrap()); - let metadata_decryptor = Arc::new(self.metadata.file_decryptor().clone().unwrap()); let column_name = self .metadata .file_metadata() .schema_descr() .column(self.column_idx); - let data_decryptor = - Arc::new(file_decryptor.get_column_decryptor(column_name.name().as_bytes())); + + let file_decryptor = self.metadata.file_decryptor().clone().unwrap().get_column_decryptor(column_name.name().as_bytes()); + let data_decryptor = Arc::new(file_decryptor.clone()); + let metadata_decryptor = Arc::new(file_decryptor.clone()); let crypto_context = - CryptoContext::new(rg_idx, self.column_idx, metadata_decryptor, data_decryptor); + CryptoContext::new(rg_idx, self.column_idx, data_decryptor, metadata_decryptor); Some(Arc::new(crypto_context)) } else { None diff --git a/parquet/src/encryption/ciphers.rs b/parquet/src/encryption/ciphers.rs index 52680169eb55..ef5d122dc442 100644 --- a/parquet/src/encryption/ciphers.rs +++ b/parquet/src/encryption/ciphers.rs @@ -329,23 +329,29 @@ impl FileDecryptor { RingGcmBlockDecryptor::new(self.decryption_properties.footer_key.as_ref().unwrap()) } + pub(crate) fn has_column_key(&self, column_name: &[u8]) -> bool { + self.decryption_properties.column_keys.clone().unwrap().contains_key(column_name) + } + pub(crate) fn get_column_decryptor(&self, column_name: &[u8]) -> FileDecryptor { - if self.decryption_properties.column_keys.is_none() { + if self.decryption_properties.column_keys.is_none() || !self.has_column_key(column_name) { return 
self.clone(); } let column_keys = &self.decryption_properties.column_keys.clone().unwrap(); - let decryptor = if let Some(column_key) = column_keys.get(column_name) { - Some(RingGcmBlockDecryptor::new(&column_key)) + let decryption_properties = if let Some(column_key) = column_keys.get(column_name) { + DecryptionPropertiesBuilder::with_defaults() + .with_footer_key(column_key.clone()) + .with_aad_prefix(self.aad_prefix.clone()) + .build() } else { - None + self.decryption_properties.clone() }; - FileDecryptor { - decryption_properties: self.decryption_properties.clone(), - footer_decryptor: decryptor, - aad_file_unique: self.aad_file_unique.clone(), - aad_prefix: self.aad_prefix.clone(), - } + FileDecryptor::new( + &decryption_properties, + self.aad_file_unique.clone(), + self.aad_prefix.clone(), + ) } pub(crate) fn decryption_properties(&self) -> &FileDecryptionProperties { diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index 8ac9a096f36c..530978050018 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -661,6 +661,10 @@ impl RowGroupMetaData { cc = ColumnChunkMetaData::from_thrift(d.clone(), c)?; } else { let column_name = crypto_metadata.path_in_schema.join("."); + if !decryptor.unwrap().has_column_key(&column_name.as_bytes()) { + cc = ColumnChunkMetaData::from_thrift(d.clone(), c)?; + break; + } let aad_file_unique = decryptor.unwrap().aad_file_unique(); let aad_prefix = decryptor .unwrap() diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index 67e8c297efd3..f9f4c796ae4c 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -350,8 +350,15 @@ pub(crate) fn read_page_header( #[cfg(feature = "encryption")] if let Some(crypto_context) = crypto_context { let decryptor = &crypto_context.data_decryptor(); + // todo: in case of per-column key, decryptor should be column decryptor + if !decryptor.has_footer_key() || 
!decryptor.footer_decryptor().is_some() { + let mut prot = TCompactInputProtocol::new(input); + let page_header = PageHeader::read_from_in_protocol(&mut prot)?; + return Ok(page_header) + }; - let file_decryptor = decryptor.column_decryptor(); + // let file_decryptor = decryptor.column_decryptor(); + let data_decryptor = &crypto_context.data_decryptor(); let aad_file_unique = decryptor.aad_file_unique(); let aad_prefix = decryptor.aad_prefix(); @@ -370,19 +377,10 @@ pub(crate) fn read_page_header( crypto_context.page_ordinal, )?; - // let mut len_bytes = [0; 4]; - // input.read_exact(&mut len_bytes)?; - // let ciphertext_len = u32::from_le_bytes(len_bytes) as usize; - // let mut ciphertext = vec![0; 4 + ciphertext_len]; - // input.read_exact(&mut ciphertext[4..])?; - // let mut ciphertext = Vec::new(); - // input.read_to_end(&mut ciphertext)?; - let mut ciphertext: Vec = vec![]; input.read_to_end(&mut ciphertext)?; - // let ciphertext = input.read_to_end(); - let buf = file_decryptor.decrypt(&ciphertext, aad.as_ref())?; + let buf = data_decryptor.footer_decryptor().unwrap().decrypt(&ciphertext, aad.as_ref())?; let mut prot = TCompactSliceInputProtocol::new(buf.as_slice()); let page_header = PageHeader::read_from_in_protocol(&mut prot)?; From 9104ab52c4d188a337f68c80b7da0ac4ed4c882c Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 21 Jan 2025 00:58:27 +0100 Subject: [PATCH 28/97] work --- parquet/src/arrow/arrow_reader/mod.rs | 11 +++++++--- parquet/src/encryption/ciphers.rs | 8 +++++-- parquet/src/file/metadata/mod.rs | 28 +++++++++++------------- parquet/src/file/metadata/reader.rs | 31 ++++++++++++++------------- parquet/src/file/serialized_reader.rs | 7 ++++-- 5 files changed, 48 insertions(+), 37 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index d5133e243e36..b7f2dfe2c10f 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -716,7 +716,12 @@ impl 
Iterator for ReaderPageIterator { .schema_descr() .column(self.column_idx); - let file_decryptor = self.metadata.file_decryptor().clone().unwrap().get_column_decryptor(column_name.name().as_bytes()); + let file_decryptor = self + .metadata + .file_decryptor() + .clone() + .unwrap() + .get_column_decryptor(column_name.name().as_bytes()); let data_decryptor = Arc::new(file_decryptor.clone()); let metadata_decryptor = Arc::new(file_decryptor.clone()); @@ -1880,8 +1885,8 @@ mod tests { let decryption_properties = Some( ciphers::FileDecryptionProperties::builder() - .with_column_key("float_field".as_bytes().to_vec(), column_1_key.to_vec()) - .with_column_key("double_field".as_bytes().to_vec(), column_2_key.to_vec()) + .with_column_key("double_field".as_bytes().to_vec(), column_1_key.to_vec()) + .with_column_key("float_field".as_bytes().to_vec(), column_2_key.to_vec()) .build(), ); diff --git a/parquet/src/encryption/ciphers.rs b/parquet/src/encryption/ciphers.rs index ef5d122dc442..797734b7c567 100644 --- a/parquet/src/encryption/ciphers.rs +++ b/parquet/src/encryption/ciphers.rs @@ -241,8 +241,12 @@ impl FileDecryptionProperties { } pub fn has_footer_key(&self) -> bool { self.footer_key.is_some() } - pub fn aad_prefix(&self) -> Option<&Vec> { - self.aad_prefix.as_ref() + pub fn has_column_keys(&self) -> bool { + self.column_keys.is_some() + } + + pub fn aad_prefix(&self) -> Option> { + self.aad_prefix.clone() } } diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index 530978050018..4c47eecdebc1 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -112,8 +112,8 @@ use crate::file::page_index::index::Index; use crate::file::page_index::offset_index::OffsetIndexMetaData; use crate::file::statistics::{self, Statistics}; use crate::format::{ - BoundaryOrder, ColumnChunk, ColumnCryptoMetaData, ColumnIndex, ColumnMetaData, OffsetIndex, - PageLocation, RowGroup, SizeStatistics, SortingColumn, + BoundaryOrder, 
ColumnChunk, ColumnCryptoMetaData, ColumnIndex, ColumnMetaData, + EncryptionAlgorithm, OffsetIndex, PageLocation, RowGroup, SizeStatistics, SortingColumn, }; use crate::schema::types::{ ColumnDescPtr, ColumnDescriptor, ColumnPath, SchemaDescPtr, SchemaDescriptor, @@ -646,7 +646,7 @@ impl RowGroupMetaData { let total_byte_size = rg.total_byte_size; let num_rows = rg.num_rows; let mut columns = vec![]; - for (i, (c, d)) in rg + for (i, (mut c, d)) in rg .columns .drain(0..) .zip(schema_descr.columns()) @@ -657,27 +657,25 @@ impl RowGroupMetaData { if let Some(ColumnCryptoMetaData::ENCRYPTIONWITHCOLUMNKEY(crypto_metadata)) = c.crypto_metadata.clone() { - if decryptor.is_none() { + if c.encrypted_column_metadata.is_none() { cc = ColumnChunkMetaData::from_thrift(d.clone(), c)?; } else { + let decryptor = decryptor.unwrap(); let column_name = crypto_metadata.path_in_schema.join("."); - if !decryptor.unwrap().has_column_key(&column_name.as_bytes()) { + if !decryptor.has_column_key(&column_name.as_bytes()) { cc = ColumnChunkMetaData::from_thrift(d.clone(), c)?; break; } - let aad_file_unique = decryptor.unwrap().aad_file_unique(); - let aad_prefix = decryptor - .unwrap() - .decryption_properties() - .aad_prefix() - .unwrap(); - let column_decryptor = decryptor - .unwrap() .get_column_decryptor(column_name.as_bytes()) .footer_decryptor() .unwrap(); + let aad_file_unique = decryptor.aad_file_unique(); + let aad_prefix: Vec = decryptor + .decryption_properties() + .aad_prefix() + .unwrap_or_default(); let column_aad = create_module_aad( [aad_prefix.as_slice(), aad_file_unique.as_slice()] .concat() @@ -688,12 +686,12 @@ impl RowGroupMetaData { None, )?; - let buf = c.encrypted_column_metadata.unwrap(); + let buf = c.encrypted_column_metadata.clone().unwrap(); let decrypted_cc_buf = column_decryptor.decrypt(buf.as_slice().as_ref(), column_aad.as_ref())?; let mut prot = TCompactSliceInputProtocol::new(decrypted_cc_buf.as_slice()); - let c = 
ColumnChunk::read_from_in_protocol(&mut prot)?; + c.meta_data = Some(ColumnMetaData::read_from_in_protocol(&mut prot)?); cc = ColumnChunkMetaData::from_thrift(d.clone(), c)?; } } else { diff --git a/parquet/src/file/metadata/reader.rs b/parquet/src/file/metadata/reader.rs index a07f04509007..238cd69a9b31 100644 --- a/parquet/src/file/metadata/reader.rs +++ b/parquet/src/file/metadata/reader.rs @@ -726,7 +726,6 @@ impl ParquetMetaDataReader { if file_decryption_properties.is_none() { return Err(general_err!("Parquet file has an encrypted footer but no decryption properties were provided")); }; - let file_decryption_properties = file_decryption_properties; let t_file_crypto_metadata: TFileCryptoMetaData = TFileCryptoMetaData::read_from_in_protocol(&mut prot) @@ -764,18 +763,6 @@ impl ParquetMetaDataReader { .map_err(|e| general_err!("Could not parse metadata: {}", e))?; let schema = types::from_thrift(&t_file_metadata.schema)?; let schema_descr = Arc::new(SchemaDescriptor::new(schema)); - let mut row_groups = Vec::new(); - // TODO: row group filtering - for rg in t_file_metadata.row_groups { - row_groups.push(RowGroupMetaData::from_thrift( - schema_descr.clone(), - rg, - #[cfg(feature = "encryption")] - decryptor.as_ref(), - )?); - } - let column_orders = - Self::parse_column_orders(t_file_metadata.column_orders, &schema_descr)?; // todo add file decryptor #[cfg(feature = "encryption")] @@ -788,9 +775,9 @@ impl ParquetMetaDataReader { }; // todo decr: add support for GCMCTRV1 let aad_file_unique = aes_gcm_algo.aad_file_unique.unwrap(); let aad_prefix: Vec = aes_gcm_algo.aad_prefix.unwrap_or_default(); - let fdp = file_decryption_properties.unwrap(); + decryptor = Some(FileDecryptor::new( - fdp, + file_decryption_properties.unwrap(), aad_file_unique.clone(), aad_prefix.clone(), )); @@ -798,6 +785,20 @@ impl ParquetMetaDataReader { // todo check signature } + let mut row_groups = Vec::new(); + // TODO: row group filtering + for rg in t_file_metadata.row_groups { + let 
r = RowGroupMetaData::from_thrift( + schema_descr.clone(), + rg, + #[cfg(feature = "encryption")] + decryptor.as_ref(), + )?; + row_groups.push(r); + } + let column_orders = + Self::parse_column_orders(t_file_metadata.column_orders, &schema_descr)?; + let file_metadata = FileMetaData::new( t_file_metadata.version, t_file_metadata.num_rows, diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index f9f4c796ae4c..bde7ab19c53c 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -354,7 +354,7 @@ pub(crate) fn read_page_header( if !decryptor.has_footer_key() || !decryptor.footer_decryptor().is_some() { let mut prot = TCompactInputProtocol::new(input); let page_header = PageHeader::read_from_in_protocol(&mut prot)?; - return Ok(page_header) + return Ok(page_header); }; // let file_decryptor = decryptor.column_decryptor(); @@ -380,7 +380,10 @@ pub(crate) fn read_page_header( let mut ciphertext: Vec = vec![]; input.read_to_end(&mut ciphertext)?; - let buf = data_decryptor.footer_decryptor().unwrap().decrypt(&ciphertext, aad.as_ref())?; + let buf = data_decryptor + .footer_decryptor() + .unwrap() + .decrypt(&ciphertext, aad.as_ref())?; let mut prot = TCompactSliceInputProtocol::new(buf.as_slice()); let page_header = PageHeader::read_from_in_protocol(&mut prot)?; From 3b3b75a37a2be2f9a4dfc47437312cff071622e8 Mon Sep 17 00:00:00 2001 From: Adam Reeve Date: Tue, 21 Jan 2025 16:53:52 +1300 Subject: [PATCH 29/97] Fix reading to end of file --- parquet/src/file/serialized_reader.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index bde7ab19c53c..8ff4bd0c5e36 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -377,8 +377,11 @@ pub(crate) fn read_page_header( crypto_context.page_ordinal, )?; - let mut ciphertext: Vec = vec![]; - 
input.read_to_end(&mut ciphertext)?; + let mut len_bytes = [0; 4]; + input.read_exact(&mut len_bytes)?; + let ciphertext_len = u32::from_le_bytes(len_bytes) as usize; + let mut ciphertext = vec![0; 4 + ciphertext_len]; + input.read_exact(&mut ciphertext[4..])?; let buf = data_decryptor .footer_decryptor() From 40d3c21f84a85ad93d4cebb945832d1d1d822915 Mon Sep 17 00:00:00 2001 From: Adam Reeve Date: Tue, 21 Jan 2025 17:01:44 +1300 Subject: [PATCH 30/97] Refactor tests --- parquet/src/arrow/arrow_reader/mod.rs | 105 +++++++------------------- 1 file changed, 28 insertions(+), 77 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index b7f2dfe2c10f..fb4d43f2b919 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -1883,45 +1883,12 @@ mod tests { let column_1_key = "1234567890123450".as_bytes(); let column_2_key = "1234567890123451".as_bytes(); - let decryption_properties = Some( - ciphers::FileDecryptionProperties::builder() - .with_column_key("double_field".as_bytes().to_vec(), column_1_key.to_vec()) - .with_column_key("float_field".as_bytes().to_vec(), column_2_key.to_vec()) - .build(), - ); - - let metadata = - ArrowReaderMetadata::load(&file, Default::default(), decryption_properties.as_ref()) - .unwrap(); - let file_metadata = metadata.metadata.file_metadata(); - - assert_eq!(file_metadata.num_rows(), 50); - assert_eq!(file_metadata.schema_descr().num_columns(), 8); - assert_eq!( - file_metadata.created_by().unwrap(), - "parquet-cpp-arrow version 19.0.0-SNAPSHOT" - ); - - metadata.metadata.row_groups().iter().for_each(|rg| { - assert_eq!(rg.num_columns(), 8); - assert_eq!(rg.num_rows(), 50); - assert_eq!(rg.total_byte_size(), 3816); - }); - - let record_reader = ParquetRecordBatchReader::try_new_with_decryption( - file, - 128, - decryption_properties.as_ref(), - ) - .unwrap(); - - let mut row_count = 0; - for batch in record_reader { - let batch = batch.unwrap(); - 
row_count += batch.num_rows(); - } + let decryption_properties = ciphers::FileDecryptionProperties::builder() + .with_column_key("double_field".as_bytes().to_vec(), column_1_key.to_vec()) + .with_column_key("float_field".as_bytes().to_vec(), column_2_key.to_vec()) + .build(); - assert_eq!(row_count, file_metadata.num_rows() as usize); + verify_encryption_test_file_read(file, decryption_properties); } #[test] @@ -1937,26 +1904,14 @@ mod tests { let column_1_key = "1234567890123450".as_bytes(); let column_2_key = "1234567890123451".as_bytes(); - let decryption_properties = Some( - ciphers::FileDecryptionProperties::builder() - .with_footer_key(footer_key.to_vec()) - .with_column_key("float_field".as_bytes().to_vec(), column_1_key.to_vec()) - .with_column_key("double_field".as_bytes().to_vec(), column_2_key.to_vec()) - .with_aad_prefix("tester".as_bytes().to_vec()) - .build(), - ); - - let metadata = - ArrowReaderMetadata::load(&file, Default::default(), decryption_properties.as_ref()) - .unwrap(); - let file_metadata = metadata.metadata.file_metadata(); + let decryption_properties = ciphers::FileDecryptionProperties::builder() + .with_footer_key(footer_key.to_vec()) + .with_column_key("float_field".as_bytes().to_vec(), column_1_key.to_vec()) + .with_column_key("double_field".as_bytes().to_vec(), column_2_key.to_vec()) + .with_aad_prefix("tester".as_bytes().to_vec()) + .build(); - assert_eq!(file_metadata.num_rows(), 50); - assert_eq!(file_metadata.schema_descr().num_columns(), 8); - assert_eq!( - file_metadata.created_by().unwrap(), - "parquet-cpp-arrow version 19.0.0-SNAPSHOT" - ); + verify_encryption_test_file_read(file, decryption_properties); } #[test] @@ -1967,17 +1922,26 @@ mod tests { let file = File::open(path).unwrap(); let key_code: &[u8] = "0123456789012345".as_bytes(); - let decryption_properties = Some( - ciphers::FileDecryptionProperties::builder() - .with_footer_key(key_code.to_vec()) - .build(), - ); + let decryption_properties = 
ciphers::FileDecryptionProperties::builder() + .with_footer_key(key_code.to_vec()) + .build(); + + verify_encryption_test_file_read(file, decryption_properties); + } + + fn verify_encryption_test_file_read(file: File, decryption_properties: ciphers::FileDecryptionProperties) { + let decryption_properties = Some(decryption_properties); let metadata = - ArrowReaderMetadata::load(&file, Default::default(), decryption_properties.as_ref()) - .unwrap(); + ArrowReaderMetadata::load(&file, Default::default(), decryption_properties.as_ref()).unwrap(); let file_metadata = metadata.metadata.file_metadata(); + let record_reader = ParquetRecordBatchReader::try_new_with_decryption( + file, + 128, + decryption_properties.as_ref(), + ).unwrap(); + assert_eq!(file_metadata.num_rows(), 50); assert_eq!(file_metadata.schema_descr().num_columns(), 8); assert_eq!( @@ -1988,21 +1952,8 @@ mod tests { metadata.metadata.row_groups().iter().for_each(|rg| { assert_eq!(rg.num_columns(), 8); assert_eq!(rg.num_rows(), 50); - assert_eq!(rg.total_byte_size(), 4172); }); - let decryption_properties = Some( - ciphers::FileDecryptionProperties::builder() - .with_footer_key(key_code.to_vec()) - .build(), - ); - let record_reader = ParquetRecordBatchReader::try_new_with_decryption( - file, - 128, - decryption_properties.as_ref(), - ) - .unwrap(); - let mut row_count = 0; for batch in record_reader { let batch = batch.unwrap(); From 3f7e8419877863640a8a4637769cdd0d2fadc642 Mon Sep 17 00:00:00 2001 From: Adam Reeve Date: Tue, 21 Jan 2025 17:07:26 +1300 Subject: [PATCH 31/97] Fix non-uniform encryption configuration --- parquet/src/arrow/arrow_reader/mod.rs | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index fb4d43f2b919..23b81b2841d7 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -1894,8 +1894,6 @@ mod tests { #[test] #[cfg(feature = "encryption")] fn 
test_non_uniform_encryption() { - // Decryption configuration 2: Decrypt using key retriever callback that holds the - // keys of two encrypted columns and the footer key. Supply aad_prefix. let testdata = arrow::util::test_util::parquet_test_data(); let path = format!("{testdata}/encrypt_columns_and_footer.parquet.encrypted"); let file = File::open(path).unwrap(); @@ -1906,9 +1904,8 @@ mod tests { let decryption_properties = ciphers::FileDecryptionProperties::builder() .with_footer_key(footer_key.to_vec()) - .with_column_key("float_field".as_bytes().to_vec(), column_1_key.to_vec()) - .with_column_key("double_field".as_bytes().to_vec(), column_2_key.to_vec()) - .with_aad_prefix("tester".as_bytes().to_vec()) + .with_column_key("double_field".as_bytes().to_vec(), column_1_key.to_vec()) + .with_column_key("float_field".as_bytes().to_vec(), column_2_key.to_vec()) .build(); verify_encryption_test_file_read(file, decryption_properties); From bf4df8a759caccde442a43f441d8fac985a58e45 Mon Sep 17 00:00:00 2001 From: Adam Reeve Date: Tue, 21 Jan 2025 17:22:22 +1300 Subject: [PATCH 32/97] Don't use footer key for non-encrypted columns --- parquet/src/arrow/arrow_reader/mod.rs | 26 +++++++++++++++----------- parquet/src/encryption/ciphers.rs | 5 +++++ 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 23b81b2841d7..c203e7d2047f 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -716,18 +716,22 @@ impl Iterator for ReaderPageIterator { .schema_descr() .column(self.column_idx); - let file_decryptor = self - .metadata - .file_decryptor() - .clone() - .unwrap() - .get_column_decryptor(column_name.name().as_bytes()); - let data_decryptor = Arc::new(file_decryptor.clone()); - let metadata_decryptor = Arc::new(file_decryptor.clone()); + if self.metadata.file_decryptor().as_ref().unwrap().is_column_encrypted(column_name.name().as_bytes()) { + 
let file_decryptor = self + .metadata + .file_decryptor() + .clone() + .unwrap() + .get_column_decryptor(column_name.name().as_bytes()); + let data_decryptor = Arc::new(file_decryptor.clone()); + let metadata_decryptor = Arc::new(file_decryptor.clone()); - let crypto_context = - CryptoContext::new(rg_idx, self.column_idx, data_decryptor, metadata_decryptor); - Some(Arc::new(crypto_context)) + let crypto_context = + CryptoContext::new(rg_idx, self.column_idx, data_decryptor, metadata_decryptor); + Some(Arc::new(crypto_context)) + } else { + None + } } else { None }; diff --git a/parquet/src/encryption/ciphers.rs b/parquet/src/encryption/ciphers.rs index 797734b7c567..1b845899947c 100644 --- a/parquet/src/encryption/ciphers.rs +++ b/parquet/src/encryption/ciphers.rs @@ -377,6 +377,11 @@ impl FileDecryptor { pub(crate) fn has_footer_key(&self) -> bool { self.decryption_properties.has_footer_key() } + + pub(crate) fn is_column_encrypted(&self, column_name: &[u8]) -> bool { + // Column is encrypted if either uniform encryption is used or an encryption key is set for the column + self.decryption_properties.column_keys.is_none() || self.has_column_key(column_name) + } } #[derive(Debug, Clone)] From 135eef2128eba84207ffa68b0c9cac40ad025d18 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 21 Jan 2025 12:55:48 +0100 Subject: [PATCH 33/97] Rebase and cleanup --- parquet/src/arrow/arrow_reader/mod.rs | 21 +++++++++++++++------ parquet/src/encryption/ciphers.rs | 10 ---------- parquet/src/file/footer.rs | 1 - parquet/src/file/metadata/mod.rs | 18 ++++-------------- parquet/src/file/serialized_reader.rs | 9 ++++++++- 5 files changed, 27 insertions(+), 32 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index c203e7d2047f..be41f9c21904 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -32,7 +32,6 @@ use crate::arrow::array_reader::{build_array_reader, ArrayReader}; use 
crate::arrow::schema::{parquet_to_arrow_schema_and_fields, ParquetField}; use crate::arrow::{parquet_to_arrow_field_levels, FieldLevels, ProjectionMask}; use crate::column::page::{PageIterator, PageReader}; -use crate::data_type::AsBytes; use crate::errors::{ParquetError, Result}; use crate::file::metadata::{ParquetMetaData, ParquetMetaDataReader}; use crate::file::reader::{ChunkReader, SerializedPageReader}; @@ -42,7 +41,6 @@ mod filter; mod selection; pub mod statistics; -use crate::encryption::ciphers::FileDecryptor; #[cfg(feature = "encryption")] use crate::encryption::ciphers::{CryptoContext, FileDecryptionProperties}; @@ -716,7 +714,13 @@ impl Iterator for ReaderPageIterator { .schema_descr() .column(self.column_idx); - if self.metadata.file_decryptor().as_ref().unwrap().is_column_encrypted(column_name.name().as_bytes()) { + if self + .metadata + .file_decryptor() + .as_ref() + .unwrap() + .is_column_encrypted(column_name.name().as_bytes()) + { let file_decryptor = self .metadata .file_decryptor() @@ -1930,18 +1934,23 @@ mod tests { verify_encryption_test_file_read(file, decryption_properties); } - fn verify_encryption_test_file_read(file: File, decryption_properties: ciphers::FileDecryptionProperties) { + fn verify_encryption_test_file_read( + file: File, + decryption_properties: ciphers::FileDecryptionProperties, + ) { let decryption_properties = Some(decryption_properties); let metadata = - ArrowReaderMetadata::load(&file, Default::default(), decryption_properties.as_ref()).unwrap(); + ArrowReaderMetadata::load(&file, Default::default(), decryption_properties.as_ref()) + .unwrap(); let file_metadata = metadata.metadata.file_metadata(); let record_reader = ParquetRecordBatchReader::try_new_with_decryption( file, 128, decryption_properties.as_ref(), - ).unwrap(); + ) + .unwrap(); assert_eq!(file_metadata.num_rows(), 50); assert_eq!(file_metadata.schema_descr().num_columns(), 8); diff --git a/parquet/src/encryption/ciphers.rs 
b/parquet/src/encryption/ciphers.rs index 1b845899947c..0e9b9e5d906d 100644 --- a/parquet/src/encryption/ciphers.rs +++ b/parquet/src/encryption/ciphers.rs @@ -22,9 +22,7 @@ use std::collections::HashMap; use std::sync::Arc; use ring::aead::{Aad, LessSafeKey, NonceSequence, UnboundKey, AES_128_GCM}; use ring::rand::{SecureRandom, SystemRandom}; -use zstd::zstd_safe::WriteBuf; use crate::errors::{ParquetError, Result}; -use crate::format::EncryptionAlgorithm; pub trait BlockEncryptor { fn encrypt(&mut self, plaintext: &[u8], aad: &[u8]) -> Result>; @@ -176,10 +174,6 @@ pub fn create_footer_aad(file_aad: &[u8]) -> Result> { create_module_aad(file_aad, ModuleType::Footer, 0, 0, None) } -pub fn create_page_aad(file_aad: &[u8], module_type: ModuleType, row_group_ordinal: usize, column_ordinal: usize, page_ordinal: Option) -> Result> { - create_module_aad(file_aad, module_type, row_group_ordinal, column_ordinal, page_ordinal) -} - pub fn create_module_aad(file_aad: &[u8], module_type: ModuleType, row_group_ordinal: usize, column_ordinal: usize, page_ordinal: Option) -> Result> { @@ -329,10 +323,6 @@ impl FileDecryptor { self.footer_decryptor.unwrap() } - pub(crate) fn column_decryptor(&self) -> RingGcmBlockDecryptor { - RingGcmBlockDecryptor::new(self.decryption_properties.footer_key.as_ref().unwrap()) - } - pub(crate) fn has_column_key(&self, column_name: &[u8]) -> bool { self.decryption_properties.column_keys.clone().unwrap().contains_key(column_name) } diff --git a/parquet/src/file/footer.rs b/parquet/src/file/footer.rs index bdab765cf700..3a4e68d4a57d 100644 --- a/parquet/src/file/footer.rs +++ b/parquet/src/file/footer.rs @@ -18,7 +18,6 @@ //! Module for working with Parquet file footers. 
#[cfg(feature = "encryption")] -use crate::encryption::ciphers::FileDecryptionProperties; use crate::errors::Result; use crate::file::{metadata::*, reader::ChunkReader, FOOTER_SIZE}; diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index 4c47eecdebc1..d2fb128c27b8 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -96,15 +96,10 @@ pub(crate) mod reader; mod writer; use crate::basic::{ColumnOrder, Compression, Encoding, Type}; -use crate::data_type::AsBytes; +use crate::encryption::ciphers::BlockDecryptor; #[cfg(feature = "encryption")] use crate::encryption::ciphers::FileDecryptor; -use crate::encryption::ciphers::{ - create_footer_aad, create_module_aad, create_page_aad, ModuleType, -}; -use crate::encryption::ciphers::{ - BlockDecryptor, DecryptionPropertiesBuilder, FileDecryptionProperties, -}; +use crate::encryption::ciphers::{create_module_aad, ModuleType}; use crate::errors::{ParquetError, Result}; pub(crate) use crate::file::metadata::memory::HeapSize; use crate::file::page_encoding_stats::{self, PageEncodingStats}; @@ -112,8 +107,8 @@ use crate::file::page_index::index::Index; use crate::file::page_index::offset_index::OffsetIndexMetaData; use crate::file::statistics::{self, Statistics}; use crate::format::{ - BoundaryOrder, ColumnChunk, ColumnCryptoMetaData, ColumnIndex, ColumnMetaData, - EncryptionAlgorithm, OffsetIndex, PageLocation, RowGroup, SizeStatistics, SortingColumn, + BoundaryOrder, ColumnChunk, ColumnCryptoMetaData, ColumnIndex, ColumnMetaData, OffsetIndex, + PageLocation, RowGroup, SizeStatistics, SortingColumn, }; use crate::schema::types::{ ColumnDescPtr, ColumnDescriptor, ColumnPath, SchemaDescPtr, SchemaDescriptor, @@ -125,7 +120,6 @@ use std::ops::Range; use std::sync::Arc; pub use writer::ParquetMetaDataWriter; pub(crate) use writer::ThriftMetadataWriter; -use zstd::zstd_safe::WriteBuf; /// Page level statistics for each column chunk of each row group. 
/// @@ -662,10 +656,6 @@ impl RowGroupMetaData { } else { let decryptor = decryptor.unwrap(); let column_name = crypto_metadata.path_in_schema.join("."); - if !decryptor.has_column_key(&column_name.as_bytes()) { - cc = ColumnChunkMetaData::from_thrift(d.clone(), c)?; - break; - } let column_decryptor = decryptor .get_column_decryptor(column_name.as_bytes()) .footer_decryptor() diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index 8ff4bd0c5e36..42ace893e4d4 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -704,6 +704,7 @@ impl SerializedPageReader { offset, remaining_bytes, next_page_header, + .. } => { loop { if *remaining_bytes == 0 { @@ -719,7 +720,11 @@ impl SerializedPageReader { } } else { let mut read = self.reader.get_read(*offset as u64)?; - let (header_len, header) = read_page_header_len(&mut read)?; + let (header_len, header) = read_page_header_len( + &mut read, + #[cfg(feature = "encryption")] + None, + )?; *offset += header_len; *remaining_bytes -= header_len; let page_meta = if let Ok(_page_meta) = PageMetadata::try_from(&header) { @@ -1375,6 +1380,8 @@ mod tests { row_group.metadata.num_rows() as usize, page_locations, props, + #[cfg(feature = "encryption")] + None, ) } From 4617870dc9a1090bdab82c8e511f8bcddd655b89 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 21 Jan 2025 17:17:31 +0100 Subject: [PATCH 34/97] Cleanup --- parquet/src/arrow/async_reader/mod.rs | 3 +- parquet/src/file/metadata/mod.rs | 78 ++++++++++++--------------- parquet/src/file/metadata/reader.rs | 6 --- 3 files changed, 37 insertions(+), 50 deletions(-) diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index c2b5410eba62..607518fdc2de 100644 --- a/parquet/src/arrow/async_reader/mod.rs +++ b/parquet/src/arrow/async_reader/mod.rs @@ -154,7 +154,7 @@ impl AsyncFileReader for T { let mut buf = Vec::with_capacity(metadata_len); 
self.take(metadata_len as _).read_to_end(&mut buf).await?; - // todo: use file_decryption_properties + // todo: provide file_decryption_properties Ok(Arc::new(ParquetMetaDataReader::decode_metadata( &buf, footer.encrypted_footer(), @@ -971,6 +971,7 @@ impl RowGroups for InMemoryRowGroup<'_> { // filter out empty offset indexes (old versions specified Some(vec![]) when no present) .filter(|index| !index.is_empty()) .map(|index| index[i].page_locations.clone()); + // todo: provide crypto_context let page_reader: Box = Box::new(SerializedPageReader::new( data.clone(), self.metadata.column(i), diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index d2fb128c27b8..908741306409 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -551,8 +551,6 @@ pub struct RowGroupMetaData { ordinal: Option, } -// todo:rok - impl RowGroupMetaData { /// Returns builder for row group metadata. pub fn builder(schema_descr: SchemaDescPtr) -> RowGroupMetaDataBuilder { @@ -646,49 +644,43 @@ impl RowGroupMetaData { .zip(schema_descr.columns()) .enumerate() { - let cc; #[cfg(feature = "encryption")] - if let Some(ColumnCryptoMetaData::ENCRYPTIONWITHCOLUMNKEY(crypto_metadata)) = - c.crypto_metadata.clone() - { - if c.encrypted_column_metadata.is_none() { - cc = ColumnChunkMetaData::from_thrift(d.clone(), c)?; - } else { - let decryptor = decryptor.unwrap(); - let column_name = crypto_metadata.path_in_schema.join("."); - let column_decryptor = decryptor - .get_column_decryptor(column_name.as_bytes()) - .footer_decryptor() - .unwrap(); - - let aad_file_unique = decryptor.aad_file_unique(); - let aad_prefix: Vec = decryptor - .decryption_properties() - .aad_prefix() - .unwrap_or_default(); - let column_aad = create_module_aad( - [aad_prefix.as_slice(), aad_file_unique.as_slice()] - .concat() - .as_slice(), - ModuleType::ColumnMetaData, - rg.ordinal.unwrap() as usize, - i as usize, - None, - )?; - - let buf = 
c.encrypted_column_metadata.clone().unwrap(); - let decrypted_cc_buf = - column_decryptor.decrypt(buf.as_slice().as_ref(), column_aad.as_ref())?; - - let mut prot = TCompactSliceInputProtocol::new(decrypted_cc_buf.as_slice()); - c.meta_data = Some(ColumnMetaData::read_from_in_protocol(&mut prot)?); - cc = ColumnChunkMetaData::from_thrift(d.clone(), c)?; - } - } else { - cc = ColumnChunkMetaData::from_thrift(d.clone(), c)?; + if c.encrypted_column_metadata.is_some() { + let decryptor = decryptor.unwrap(); + let Some(ColumnCryptoMetaData::ENCRYPTIONWITHCOLUMNKEY(crypto_metadata)) = + c.crypto_metadata.clone() + else { + todo!() + }; + let column_name = crypto_metadata.path_in_schema.join("."); + let column_decryptor = decryptor + .get_column_decryptor(column_name.as_bytes()) + .footer_decryptor() + .unwrap(); + + let aad_file_unique = decryptor.aad_file_unique(); + let aad_prefix: Vec = decryptor + .decryption_properties() + .aad_prefix() + .unwrap_or_default(); + let column_aad = create_module_aad( + [aad_prefix.as_slice(), aad_file_unique.as_slice()] + .concat() + .as_slice(), + ModuleType::ColumnMetaData, + rg.ordinal.unwrap() as usize, + i, + None, + )?; + + let buf = c.encrypted_column_metadata.clone().unwrap(); + let decrypted_cc_buf = + column_decryptor.decrypt(buf.as_slice(), column_aad.as_ref())?; + + let mut prot = TCompactSliceInputProtocol::new(decrypted_cc_buf.as_slice()); + c.meta_data = Some(ColumnMetaData::read_from_in_protocol(&mut prot)?); } - - columns.push(cc); + columns.push(ColumnChunkMetaData::from_thrift(d.clone(), c)?); } let sorting_columns = rg.sorting_columns; Ok(RowGroupMetaData { diff --git a/parquet/src/file/metadata/reader.rs b/parquet/src/file/metadata/reader.rs index 238cd69a9b31..0c5c9f2cb4e5 100644 --- a/parquet/src/file/metadata/reader.rs +++ b/parquet/src/file/metadata/reader.rs @@ -738,11 +738,6 @@ impl ParquetMetaDataReader { }; // todo decr: add support for GCMCTRV1 // todo decr: get key_metadata - - // remaining buffer 
contains encrypted FileMetaData - - // todo decr: get aad_prefix - // todo decr: set both aad_prefix and aad_file_unique in file_decryptor let aad_file_unique = aes_gcm_algo.aad_file_unique.unwrap(); let aad_footer = create_footer_aad(aad_file_unique.as_ref())?; let aad_prefix: Vec = aes_gcm_algo.aad_prefix.unwrap_or_default(); @@ -764,7 +759,6 @@ impl ParquetMetaDataReader { let schema = types::from_thrift(&t_file_metadata.schema)?; let schema_descr = Arc::new(SchemaDescriptor::new(schema)); - // todo add file decryptor #[cfg(feature = "encryption")] if t_file_metadata.encryption_algorithm.is_some() { let algo = t_file_metadata.encryption_algorithm; From 397d37b9d55c39d3224bb8895fb6722d26c1bb61 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 21 Jan 2025 17:33:25 +0100 Subject: [PATCH 35/97] Cleanup --- parquet/src/encryption/ciphers.rs | 15 ++++++++------- parquet/src/file/metadata/mod.rs | 21 +++++++++++++++++---- parquet/src/file/serialized_reader.rs | 15 ++++++--------- 3 files changed, 31 insertions(+), 20 deletions(-) diff --git a/parquet/src/encryption/ciphers.rs b/parquet/src/encryption/ciphers.rs index 0e9b9e5d906d..8c9a3a845c60 100644 --- a/parquet/src/encryption/ciphers.rs +++ b/parquet/src/encryption/ciphers.rs @@ -174,7 +174,12 @@ pub fn create_footer_aad(file_aad: &[u8]) -> Result> { create_module_aad(file_aad, ModuleType::Footer, 0, 0, None) } -pub fn create_module_aad(file_aad: &[u8], module_type: ModuleType, row_group_ordinal: usize, +pub fn create_page_aad(file_aad: &[u8], module_type: ModuleType, row_group_ordinal: usize, + column_ordinal: usize, page_ordinal: Option) -> Result> { + create_module_aad(file_aad, module_type, row_group_ordinal, column_ordinal, page_ordinal) +} + +fn create_module_aad(file_aad: &[u8], module_type: ModuleType, row_group_ordinal: usize, column_ordinal: usize, page_ordinal: Option) -> Result> { let module_buf = [module_type as u8]; @@ -279,7 +284,7 @@ impl DecryptionPropertiesBuilder { } pub fn with_column_key(mut 
self, key: Vec, value: Vec) -> Self { - let mut column_keys= self.column_keys.unwrap_or_else(HashMap::new); + let mut column_keys= self.column_keys.unwrap_or_default(); column_keys.insert(key, value); self.column_keys = Some(column_keys); self @@ -303,11 +308,7 @@ impl PartialEq for FileDecryptor { impl FileDecryptor { pub(crate) fn new(decryption_properties: &FileDecryptionProperties, aad_file_unique: Vec, aad_prefix: Vec) -> Self { - let footer_decryptor = if let Some(footer_key) = decryption_properties.footer_key.clone() { - Some(RingGcmBlockDecryptor::new(footer_key.as_ref())) - } else { - None - }; + let footer_decryptor = decryption_properties.footer_key.clone().map(|footer_key| RingGcmBlockDecryptor::new(footer_key.as_ref())); Self { // todo decr: if no key available yet (not set in properties, will be retrieved from metadata) diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index 908741306409..1101a3ba8d5f 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -99,7 +99,7 @@ use crate::basic::{ColumnOrder, Compression, Encoding, Type}; use crate::encryption::ciphers::BlockDecryptor; #[cfg(feature = "encryption")] use crate::encryption::ciphers::FileDecryptor; -use crate::encryption::ciphers::{create_module_aad, ModuleType}; +use crate::encryption::ciphers::{create_page_aad, ModuleType}; use crate::errors::{ParquetError, Result}; pub(crate) use crate::file::metadata::memory::HeapSize; use crate::file::page_encoding_stats::{self, PageEncodingStats}; @@ -114,7 +114,9 @@ use crate::schema::types::{ ColumnDescPtr, ColumnDescriptor, ColumnPath, SchemaDescPtr, SchemaDescriptor, Type as SchemaType, }; -use crate::thrift::{TCompactSliceInputProtocol, TSerializable}; +#[cfg(feature = "encryption")] +use crate::thrift::TCompactSliceInputProtocol; +use crate::thrift::TSerializable; pub use reader::ParquetMetaDataReader; use std::ops::Range; use std::sync::Arc; @@ -638,13 +640,24 @@ impl RowGroupMetaData { 
let total_byte_size = rg.total_byte_size; let num_rows = rg.num_rows; let mut columns = vec![]; + + #[cfg(not(feature = "encryption"))] + for (i, (c, d)) in rg + .columns + .drain(0..) + .zip(schema_descr.columns()) + .enumerate() + { + columns.push(ColumnChunkMetaData::from_thrift(d.clone(), c)?); + } + + #[cfg(feature = "encryption")] for (i, (mut c, d)) in rg .columns .drain(0..) .zip(schema_descr.columns()) .enumerate() { - #[cfg(feature = "encryption")] if c.encrypted_column_metadata.is_some() { let decryptor = decryptor.unwrap(); let Some(ColumnCryptoMetaData::ENCRYPTIONWITHCOLUMNKEY(crypto_metadata)) = @@ -663,7 +676,7 @@ impl RowGroupMetaData { .decryption_properties() .aad_prefix() .unwrap_or_default(); - let column_aad = create_module_aad( + let column_aad = create_page_aad( [aad_prefix.as_slice(), aad_file_unique.as_slice()] .concat() .as_slice(), diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index 42ace893e4d4..58327e730a98 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -23,7 +23,7 @@ use crate::bloom_filter::Sbbf; use crate::column::page::{Page, PageMetadata, PageReader}; use crate::compression::{create_codec, Codec}; #[cfg(feature = "encryption")] -use crate::encryption::ciphers::{create_module_aad, BlockDecryptor, CryptoContext, ModuleType}; +use crate::encryption::ciphers::{create_page_aad, BlockDecryptor, CryptoContext, ModuleType}; use crate::errors::{ParquetError, Result}; use crate::file::page_index::offset_index::OffsetIndexMetaData; use crate::file::{ @@ -345,19 +345,16 @@ pub(crate) fn read_page_header( input: &mut T, #[cfg(feature = "encryption")] crypto_context: Option>, ) -> Result { - // todo: if column is not encrypted skip decryption - #[cfg(feature = "encryption")] if let Some(crypto_context) = crypto_context { let decryptor = &crypto_context.data_decryptor(); - // todo: in case of per-column key, decryptor should be column decryptor - if 
!decryptor.has_footer_key() || !decryptor.footer_decryptor().is_some() { + + if !decryptor.has_footer_key() || decryptor.footer_decryptor().is_none() { let mut prot = TCompactInputProtocol::new(input); let page_header = PageHeader::read_from_in_protocol(&mut prot)?; return Ok(page_header); }; - // let file_decryptor = decryptor.column_decryptor(); let data_decryptor = &crypto_context.data_decryptor(); let aad_file_unique = decryptor.aad_file_unique(); let aad_prefix = decryptor.aad_prefix(); @@ -367,7 +364,7 @@ pub(crate) fn read_page_header( } else { ModuleType::DataPageHeader }; - let aad = create_module_aad( + let aad = create_page_aad( [aad_prefix.as_slice(), aad_file_unique.as_slice()] .concat() .as_slice(), @@ -485,14 +482,14 @@ pub(crate) fn decode_page( } else { ModuleType::DataPage }; - let aad = create_module_aad( + let aad = create_page_aad( decryptor.aad_file_unique().as_slice(), module_type, crypto_context.row_group_ordinal, crypto_context.column_ordinal, crypto_context.page_ordinal, )?; - let decrypted = file_decryptor.unwrap().decrypt(&buffer.as_ref(), &aad)?; + let decrypted = file_decryptor.unwrap().decrypt(buffer.as_ref(), &aad)?; Bytes::from(decrypted) } } else { From 3d3bfd8646c72d317be9f063f81ae93dbd0c2ad8 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 21 Jan 2025 17:44:19 +0100 Subject: [PATCH 36/97] Cleanup --- parquet/src/encryption/ciphers.rs | 185 +++++++++++++++--------------- parquet/src/file/metadata/mod.rs | 20 ++-- 2 files changed, 100 insertions(+), 105 deletions(-) diff --git a/parquet/src/encryption/ciphers.rs b/parquet/src/encryption/ciphers.rs index 8c9a3a845c60..3c156ff69839 100644 --- a/parquet/src/encryption/ciphers.rs +++ b/parquet/src/encryption/ciphers.rs @@ -20,105 +20,106 @@ use std::collections::HashMap; use std::sync::Arc; -use ring::aead::{Aad, LessSafeKey, NonceSequence, UnboundKey, AES_128_GCM}; -use ring::rand::{SecureRandom, SystemRandom}; +use ring::aead::{Aad, LessSafeKey, UnboundKey, AES_128_GCM}; +// use 
ring::aead::NonceSequence; +// use ring::rand::{SecureRandom, SystemRandom}; use crate::errors::{ParquetError, Result}; -pub trait BlockEncryptor { - fn encrypt(&mut self, plaintext: &[u8], aad: &[u8]) -> Result>; -} +// pub trait BlockEncryptor { +// fn encrypt(&mut self, plaintext: &[u8], aad: &[u8]) -> Result>; +// } pub trait BlockDecryptor { fn decrypt(&self, length_and_ciphertext: &[u8], aad: &[u8]) -> Result>; } -const RIGHT_TWELVE: u128 = 0x0000_0000_ffff_ffff_ffff_ffff_ffff_ffff; +// const RIGHT_TWELVE: u128 = 0x0000_0000_ffff_ffff_ffff_ffff_ffff_ffff; const NONCE_LEN: usize = 12; const TAG_LEN: usize = 16; const SIZE_LEN: usize = 4; -struct CounterNonce { - start: u128, - counter: u128, -} - -impl CounterNonce { - pub fn new(rng: &SystemRandom) -> Self { - let mut buf = [0; 16]; - rng.fill(&mut buf).unwrap(); - - // Since this is a random seed value, endianess doesn't matter at all, - // and we can use whatever is platform-native. - let start = u128::from_ne_bytes(buf) & RIGHT_TWELVE; - let counter = start.wrapping_add(1); - - Self { start, counter } - } - - /// One accessor for the nonce bytes to avoid potentially flipping endianess - #[inline] - pub fn get_bytes(&self) -> [u8; NONCE_LEN] { - self.counter.to_le_bytes()[0..NONCE_LEN].try_into().unwrap() - } -} - -impl NonceSequence for CounterNonce { - fn advance(&mut self) -> Result { - // If we've wrapped around, we've exhausted this nonce sequence - if (self.counter & RIGHT_TWELVE) == (self.start & RIGHT_TWELVE) { - Err(ring::error::Unspecified) - } else { - // Otherwise, just advance and return the new value - let buf: [u8; NONCE_LEN] = self.get_bytes(); - self.counter = self.counter.wrapping_add(1); - Ok(ring::aead::Nonce::assume_unique_for_key(buf)) - } - } -} - -pub(crate) struct RingGcmBlockEncryptor { - key: LessSafeKey, - nonce_sequence: CounterNonce, -} - -impl RingGcmBlockEncryptor { - // todo TBD: some KMS systems produce data keys, need to be able to pass them to Encryptor. 
- // todo TBD: for other KMSs, we will create data keys inside arrow-rs, making sure to use SystemRandom - /// Create a new `RingGcmBlockEncryptor` with a given key and random nonce. - /// The nonce will advance appropriately with each block encryption and - /// return an error if it wraps around. - pub(crate) fn new(key_bytes: &[u8]) -> Self { - let rng = SystemRandom::new(); - - // todo support other key sizes - let key = UnboundKey::new(&AES_128_GCM, key_bytes.as_ref()).unwrap(); - let nonce = CounterNonce::new(&rng); - - Self { - key: LessSafeKey::new(key), - nonce_sequence: nonce, - } - } -} - -impl BlockEncryptor for RingGcmBlockEncryptor { - fn encrypt(&mut self, plaintext: &[u8], aad: &[u8]) -> Result> { - let nonce = self.nonce_sequence.advance()?; - let ciphertext_len = plaintext.len() + NONCE_LEN + TAG_LEN; - // todo TBD: add first 4 bytes with the length, per https://github.com/apache/parquet-format/blob/master/Encryption.md#51-encrypted-module-serialization - let mut result = Vec::with_capacity(SIZE_LEN + ciphertext_len); - result.extend_from_slice((ciphertext_len as i32).to_le_bytes().as_ref()); - result.extend_from_slice(nonce.as_ref()); - result.extend_from_slice(plaintext); - - let tag = self - .key - .seal_in_place_separate_tag(nonce, Aad::from(aad), &mut result[SIZE_LEN + NONCE_LEN..])?; - result.extend_from_slice(tag.as_ref()); - - Ok(result) - } -} +// struct CounterNonce { +// start: u128, +// counter: u128, +// } +// +// impl CounterNonce { +// pub fn new(rng: &SystemRandom) -> Self { +// let mut buf = [0; 16]; +// rng.fill(&mut buf).unwrap(); +// +// // Since this is a random seed value, endianess doesn't matter at all, +// // and we can use whatever is platform-native. 
+// let start = u128::from_ne_bytes(buf) & RIGHT_TWELVE; +// let counter = start.wrapping_add(1); +// +// Self { start, counter } +// } +// +// /// One accessor for the nonce bytes to avoid potentially flipping endianess +// #[inline] +// pub fn get_bytes(&self) -> [u8; NONCE_LEN] { +// self.counter.to_le_bytes()[0..NONCE_LEN].try_into().unwrap() +// } +// } +// +// impl NonceSequence for CounterNonce { +// fn advance(&mut self) -> Result { +// // If we've wrapped around, we've exhausted this nonce sequence +// if (self.counter & RIGHT_TWELVE) == (self.start & RIGHT_TWELVE) { +// Err(ring::error::Unspecified) +// } else { +// // Otherwise, just advance and return the new value +// let buf: [u8; NONCE_LEN] = self.get_bytes(); +// self.counter = self.counter.wrapping_add(1); +// Ok(ring::aead::Nonce::assume_unique_for_key(buf)) +// } +// } +// } +// +// pub(crate) struct RingGcmBlockEncryptor { +// key: LessSafeKey, +// nonce_sequence: CounterNonce, +// } +// +// impl RingGcmBlockEncryptor { +// // todo TBD: some KMS systems produce data keys, need to be able to pass them to Encryptor. +// // todo TBD: for other KMSs, we will create data keys inside arrow-rs, making sure to use SystemRandom +// /// Create a new `RingGcmBlockEncryptor` with a given key and random nonce. +// /// The nonce will advance appropriately with each block encryption and +// /// return an error if it wraps around. 
+// pub(crate) fn new(key_bytes: &[u8]) -> Self { +// let rng = SystemRandom::new(); +// +// // todo support other key sizes +// let key = UnboundKey::new(&AES_128_GCM, key_bytes.as_ref()).unwrap(); +// let nonce = CounterNonce::new(&rng); +// +// Self { +// key: LessSafeKey::new(key), +// nonce_sequence: nonce, +// } +// } +// } +// +// impl BlockEncryptor for RingGcmBlockEncryptor { +// fn encrypt(&mut self, plaintext: &[u8], aad: &[u8]) -> Result> { +// let nonce = self.nonce_sequence.advance()?; +// let ciphertext_len = plaintext.len() + NONCE_LEN + TAG_LEN; +// // todo TBD: add first 4 bytes with the length, per https://github.com/apache/parquet-format/blob/master/Encryption.md#51-encrypted-module-serialization +// let mut result = Vec::with_capacity(SIZE_LEN + ciphertext_len); +// result.extend_from_slice((ciphertext_len as i32).to_le_bytes().as_ref()); +// result.extend_from_slice(nonce.as_ref()); +// result.extend_from_slice(plaintext); +// +// let tag = self +// .key +// .seal_in_place_separate_tag(nonce, Aad::from(aad), &mut result[SIZE_LEN + NONCE_LEN..])?; +// result.extend_from_slice(tag.as_ref()); +// +// Ok(result) +// } +// } #[derive(Debug, Clone)] pub(crate) struct RingGcmBlockDecryptor { @@ -164,10 +165,10 @@ pub(crate) enum ModuleType { DictionaryPage = 3, DataPageHeader = 4, DictionaryPageHeader = 5, - ColumnIndex = 6, - OffsetIndex = 7, - BloomFilterHeader = 8, - BloomFilterBitset = 9, + // ColumnIndex = 6, + // OffsetIndex = 7, + // BloomFilterHeader = 8, + // BloomFilterBitset = 9, } pub fn create_footer_aad(file_aad: &[u8]) -> Result> { diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index 1101a3ba8d5f..82a2ac62e225 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -96,27 +96,26 @@ pub(crate) mod reader; mod writer; use crate::basic::{ColumnOrder, Compression, Encoding, Type}; -use crate::encryption::ciphers::BlockDecryptor; #[cfg(feature = "encryption")] -use 
crate::encryption::ciphers::FileDecryptor; -use crate::encryption::ciphers::{create_page_aad, ModuleType}; +use crate::encryption::ciphers::{create_page_aad, BlockDecryptor, FileDecryptor, ModuleType}; use crate::errors::{ParquetError, Result}; pub(crate) use crate::file::metadata::memory::HeapSize; use crate::file::page_encoding_stats::{self, PageEncodingStats}; use crate::file::page_index::index::Index; use crate::file::page_index::offset_index::OffsetIndexMetaData; use crate::file::statistics::{self, Statistics}; +#[cfg(feature = "encryption")] +use crate::format::ColumnCryptoMetaData; use crate::format::{ - BoundaryOrder, ColumnChunk, ColumnCryptoMetaData, ColumnIndex, ColumnMetaData, OffsetIndex, - PageLocation, RowGroup, SizeStatistics, SortingColumn, + BoundaryOrder, ColumnChunk, ColumnIndex, ColumnMetaData, OffsetIndex, PageLocation, RowGroup, + SizeStatistics, SortingColumn, }; use crate::schema::types::{ ColumnDescPtr, ColumnDescriptor, ColumnPath, SchemaDescPtr, SchemaDescriptor, Type as SchemaType, }; #[cfg(feature = "encryption")] -use crate::thrift::TCompactSliceInputProtocol; -use crate::thrift::TSerializable; +use crate::thrift::{TCompactSliceInputProtocol, TSerializable}; pub use reader::ParquetMetaDataReader; use std::ops::Range; use std::sync::Arc; @@ -642,12 +641,7 @@ impl RowGroupMetaData { let mut columns = vec![]; #[cfg(not(feature = "encryption"))] - for (i, (c, d)) in rg - .columns - .drain(0..) 
- .zip(schema_descr.columns()) - .enumerate() - { + for (c, d) in rg.columns.drain(0..).zip(schema_descr.columns()) { columns.push(ColumnChunkMetaData::from_thrift(d.clone(), c)?); } From e53306e6e93ee12cd8744cc1f5f50f4eaf653399 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 21 Jan 2025 17:59:44 +0100 Subject: [PATCH 37/97] Cleanup --- parquet/src/encryption/ciphers.rs | 2 +- parquet/tests/arrow_writer_layout.rs | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/parquet/src/encryption/ciphers.rs b/parquet/src/encryption/ciphers.rs index 3c156ff69839..525c219c759c 100644 --- a/parquet/src/encryption/ciphers.rs +++ b/parquet/src/encryption/ciphers.rs @@ -175,7 +175,7 @@ pub fn create_footer_aad(file_aad: &[u8]) -> Result> { create_module_aad(file_aad, ModuleType::Footer, 0, 0, None) } -pub fn create_page_aad(file_aad: &[u8], module_type: ModuleType, row_group_ordinal: usize, +pub(crate) fn create_page_aad(file_aad: &[u8], module_type: ModuleType, row_group_ordinal: usize, column_ordinal: usize, page_ordinal: Option) -> Result> { create_module_aad(file_aad, module_type, row_group_ordinal, column_ordinal, page_ordinal) } diff --git a/parquet/tests/arrow_writer_layout.rs b/parquet/tests/arrow_writer_layout.rs index 9a66d13f84d7..9297b8d13f07 100644 --- a/parquet/tests/arrow_writer_layout.rs +++ b/parquet/tests/arrow_writer_layout.rs @@ -141,6 +141,8 @@ fn assert_layout(file_reader: &Bytes, meta: &ParquetMetaData, layout: &Layout) { row_group.num_rows() as usize, None, Arc::new(properties), + #[cfg(feature = "encryption")] + None, ) .unwrap(); From 95888bcd4ca5a1561694415cdb37e58a1ce2051b Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 21 Jan 2025 18:13:16 +0100 Subject: [PATCH 38/97] Cleanup --- parquet/examples/read_with_rowgroup.rs | 2 ++ parquet/src/file/metadata/mod.rs | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/parquet/examples/read_with_rowgroup.rs b/parquet/examples/read_with_rowgroup.rs index 
8cccc7fe14ac..9312355c8845 100644 --- a/parquet/examples/read_with_rowgroup.rs +++ b/parquet/examples/read_with_rowgroup.rs @@ -121,6 +121,8 @@ impl RowGroups for InMemoryRowGroup { self.metadata.column(i), self.num_rows(), None, + #[cfg(feature = "encryption")] + None, )?); Ok(Box::new(ColumnChunkIterator { diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index 82a2ac62e225..9f79a2416cda 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -1932,7 +1932,7 @@ mod tests { let parquet_meta = ParquetMetaDataBuilder::new(file_metadata.clone()) .set_row_groups(row_group_meta_with_stats) .build(); - let base_expected_size = 2896; + let base_expected_size = 3008; assert_eq!(parquet_meta.memory_size(), base_expected_size); @@ -1959,7 +1959,7 @@ mod tests { ]])) .build(); - let bigger_expected_size = 3400; + let bigger_expected_size = 3512; // more set fields means more memory usage assert!(bigger_expected_size > base_expected_size); assert_eq!(parquet_meta.memory_size(), bigger_expected_size); From ed1bb3c9a1f63f901d140fcc497ee8842a4e32a7 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 21 Jan 2025 21:13:46 +0100 Subject: [PATCH 39/97] Cleanup --- parquet/examples/read_with_rowgroup.rs | 1 + parquet/src/encryption/ciphers.rs | 105 ++++++++++++++++++------- parquet/src/file/footer.rs | 1 - 3 files changed, 76 insertions(+), 31 deletions(-) diff --git a/parquet/examples/read_with_rowgroup.rs b/parquet/examples/read_with_rowgroup.rs index 9312355c8845..be274fe73e0b 100644 --- a/parquet/examples/read_with_rowgroup.rs +++ b/parquet/examples/read_with_rowgroup.rs @@ -115,6 +115,7 @@ impl RowGroups for InMemoryRowGroup { None => Err(ParquetError::General(format!( "Invalid column index {i}, column was not fetched" ))), + // todo: provide crypto_context Some(data) => { let page_reader: Box = Box::new(SerializedPageReader::new( data.clone(), diff --git a/parquet/src/encryption/ciphers.rs 
b/parquet/src/encryption/ciphers.rs index 525c219c759c..3141fb3ce843 100644 --- a/parquet/src/encryption/ciphers.rs +++ b/parquet/src/encryption/ciphers.rs @@ -139,9 +139,8 @@ impl RingGcmBlockDecryptor { impl BlockDecryptor for RingGcmBlockDecryptor { fn decrypt(&self, length_and_ciphertext: &[u8], aad: &[u8]) -> Result> { - let mut result = Vec::with_capacity( - length_and_ciphertext.len() - SIZE_LEN - NONCE_LEN - TAG_LEN, - ); + let mut result = + Vec::with_capacity(length_and_ciphertext.len() - SIZE_LEN - NONCE_LEN - TAG_LEN); result.extend_from_slice(&length_and_ciphertext[SIZE_LEN + NONCE_LEN..]); let nonce = ring::aead::Nonce::try_assume_unique_for_key( @@ -175,13 +174,29 @@ pub fn create_footer_aad(file_aad: &[u8]) -> Result> { create_module_aad(file_aad, ModuleType::Footer, 0, 0, None) } -pub(crate) fn create_page_aad(file_aad: &[u8], module_type: ModuleType, row_group_ordinal: usize, - column_ordinal: usize, page_ordinal: Option) -> Result> { - create_module_aad(file_aad, module_type, row_group_ordinal, column_ordinal, page_ordinal) +pub(crate) fn create_page_aad( + file_aad: &[u8], + module_type: ModuleType, + row_group_ordinal: usize, + column_ordinal: usize, + page_ordinal: Option, +) -> Result> { + create_module_aad( + file_aad, + module_type, + row_group_ordinal, + column_ordinal, + page_ordinal, + ) } -fn create_module_aad(file_aad: &[u8], module_type: ModuleType, row_group_ordinal: usize, - column_ordinal: usize, page_ordinal: Option) -> Result> { +fn create_module_aad( + file_aad: &[u8], + module_type: ModuleType, + row_group_ordinal: usize, + column_ordinal: usize, + page_ordinal: Option, +) -> Result> { let module_buf = [module_type as u8]; @@ -189,34 +204,44 @@ fn create_module_aad(file_aad: &[u8], module_type: ModuleType, row_group_ordinal let mut aad = Vec::with_capacity(file_aad.len() + 1); aad.extend_from_slice(file_aad); aad.extend_from_slice(module_buf.as_ref()); - return Ok(aad) + return Ok(aad); } if row_group_ordinal > i16::MAX as 
usize { - return Err(general_err!("Encrypted parquet files can't have more than {} row groups: {}", - i16::MAX, row_group_ordinal)); + return Err(general_err!( + "Encrypted parquet files can't have more than {} row groups: {}", + i16::MAX, + row_group_ordinal + )); } if column_ordinal > i16::MAX as usize { - return Err(general_err!("Encrypted parquet files can't have more than {} columns: {}", - i16::MAX, column_ordinal)); + return Err(general_err!( + "Encrypted parquet files can't have more than {} columns: {}", + i16::MAX, + column_ordinal + )); } - if module_buf[0] != (ModuleType::DataPageHeader as u8) && - module_buf[0] != (ModuleType::DataPage as u8) { + if module_buf[0] != (ModuleType::DataPageHeader as u8) + && module_buf[0] != (ModuleType::DataPage as u8) + { let mut aad = Vec::with_capacity(file_aad.len() + 5); aad.extend_from_slice(file_aad); aad.extend_from_slice(module_buf.as_ref()); aad.extend_from_slice((row_group_ordinal as i16).to_le_bytes().as_ref()); aad.extend_from_slice((column_ordinal as i16).to_le_bytes().as_ref()); - return Ok(aad) + return Ok(aad); } - let page_ordinal = page_ordinal.ok_or_else(|| general_err!( - "Page ordinal must be set for data pages"))?; + let page_ordinal = + page_ordinal.ok_or_else(|| general_err!("Page ordinal must be set for data pages"))?; if page_ordinal > i16::MAX as usize { - return Err(general_err!("Encrypted parquet files can't have more than {} pages per column chunk: {}", - i16::MAX, page_ordinal)); + return Err(general_err!( + "Encrypted parquet files can't have more than {} pages per column chunk: {}", + i16::MAX, + page_ordinal + )); } let mut aad = Vec::with_capacity(file_aad.len() + 7); @@ -239,7 +264,9 @@ impl FileDecryptionProperties { pub fn builder() -> DecryptionPropertiesBuilder { DecryptionPropertiesBuilder::with_defaults() } - pub fn has_footer_key(&self) -> bool { self.footer_key.is_some() } + pub fn has_footer_key(&self) -> bool { + self.footer_key.is_some() + } pub fn has_column_keys(&self) -> 
bool { self.column_keys.is_some() @@ -285,7 +312,7 @@ impl DecryptionPropertiesBuilder { } pub fn with_column_key(mut self, key: Vec, value: Vec) -> Self { - let mut column_keys= self.column_keys.unwrap_or_default(); + let mut column_keys = self.column_keys.unwrap_or_default(); column_keys.insert(key, value); self.column_keys = Some(column_keys); self @@ -308,8 +335,15 @@ impl PartialEq for FileDecryptor { } impl FileDecryptor { - pub(crate) fn new(decryption_properties: &FileDecryptionProperties, aad_file_unique: Vec, aad_prefix: Vec) -> Self { - let footer_decryptor = decryption_properties.footer_key.clone().map(|footer_key| RingGcmBlockDecryptor::new(footer_key.as_ref())); + pub(crate) fn new( + decryption_properties: &FileDecryptionProperties, + aad_file_unique: Vec, + aad_prefix: Vec, + ) -> Self { + let footer_decryptor = decryption_properties + .footer_key + .clone() + .map(|footer_key| RingGcmBlockDecryptor::new(footer_key.as_ref())); Self { // todo decr: if no key available yet (not set in properties, will be retrieved from metadata) @@ -326,7 +360,11 @@ impl FileDecryptor { } pub(crate) fn has_column_key(&self, column_name: &[u8]) -> bool { - self.decryption_properties.column_keys.clone().unwrap().contains_key(column_name) + self.decryption_properties + .column_keys + .clone() + .unwrap() + .contains_key(column_name) } pub(crate) fn get_column_decryptor(&self, column_name: &[u8]) -> FileDecryptor { @@ -387,9 +425,12 @@ pub struct CryptoContext { } impl CryptoContext { - pub fn new(row_group_ordinal: usize, - column_ordinal: usize, data_decryptor: Arc, - metadata_decryptor: Arc) -> Self { + pub fn new( + row_group_ordinal: usize, + column_ordinal: usize, + data_decryptor: Arc, + metadata_decryptor: Arc + ) -> Self { Self { row_group_ordinal, column_ordinal, @@ -422,6 +463,10 @@ impl CryptoContext { } } - pub fn data_decryptor(&self) -> Arc { self.data_decryptor.clone()} - pub fn metadata_decryptor(&self) -> Arc { self.metadata_decryptor.clone() } + pub fn 
data_decryptor(&self) -> Arc { + self.data_decryptor.clone() + } + pub fn metadata_decryptor(&self) -> Arc { + self.metadata_decryptor.clone() + } } diff --git a/parquet/src/file/footer.rs b/parquet/src/file/footer.rs index 3a4e68d4a57d..5be084259e18 100644 --- a/parquet/src/file/footer.rs +++ b/parquet/src/file/footer.rs @@ -17,7 +17,6 @@ //! Module for working with Parquet file footers. -#[cfg(feature = "encryption")] use crate::errors::Result; use crate::file::{metadata::*, reader::ChunkReader, FOOTER_SIZE}; From ee2cbed3b7467a13eb950e1504f9dcd60752997e Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 21 Jan 2025 21:25:57 +0100 Subject: [PATCH 40/97] lint --- parquet/src/arrow/arrow_reader/mod.rs | 1 + parquet/src/encryption/ciphers.rs | 8 +++----- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index be41f9c21904..a98b3f7c3059 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -1934,6 +1934,7 @@ mod tests { verify_encryption_test_file_read(file, decryption_properties); } + #[cfg(feature = "encryption")] fn verify_encryption_test_file_read( file: File, decryption_properties: ciphers::FileDecryptionProperties, diff --git a/parquet/src/encryption/ciphers.rs b/parquet/src/encryption/ciphers.rs index 3141fb3ce843..3626b1f6259a 100644 --- a/parquet/src/encryption/ciphers.rs +++ b/parquet/src/encryption/ciphers.rs @@ -18,9 +18,9 @@ //! Encryption implementation specific to Parquet, as described //! in the [spec](https://github.com/apache/parquet-format/blob/master/Encryption.md). 
+use ring::aead::{Aad, LessSafeKey, UnboundKey, AES_128_GCM}; use std::collections::HashMap; use std::sync::Arc; -use ring::aead::{Aad, LessSafeKey, UnboundKey, AES_128_GCM}; // use ring::aead::NonceSequence; // use ring::rand::{SecureRandom, SystemRandom}; use crate::errors::{ParquetError, Result}; @@ -147,8 +147,7 @@ impl BlockDecryptor for RingGcmBlockDecryptor { &length_and_ciphertext[SIZE_LEN..SIZE_LEN + NONCE_LEN], )?; - self.key - .open_in_place(nonce, Aad::from(aad), &mut result)?; + self.key.open_in_place(nonce, Aad::from(aad), &mut result)?; // Truncate result to remove the tag result.resize(result.len() - TAG_LEN, 0u8); @@ -197,7 +196,6 @@ fn create_module_aad( column_ordinal: usize, page_ordinal: Option, ) -> Result> { - let module_buf = [module_type as u8]; if module_buf[0] == (ModuleType::Footer as u8) { @@ -429,7 +427,7 @@ impl CryptoContext { row_group_ordinal: usize, column_ordinal: usize, data_decryptor: Arc, - metadata_decryptor: Arc + metadata_decryptor: Arc, ) -> Self { Self { row_group_ordinal, From fbd23cb552c38472edc65414a41f616b774ce192 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 22 Jan 2025 12:15:39 +0100 Subject: [PATCH 41/97] Remove encryption setup --- parquet/src/encryption/ciphers.rs | 93 ------------------------------- 1 file changed, 93 deletions(-) diff --git a/parquet/src/encryption/ciphers.rs b/parquet/src/encryption/ciphers.rs index 3626b1f6259a..780e476726a6 100644 --- a/parquet/src/encryption/ciphers.rs +++ b/parquet/src/encryption/ciphers.rs @@ -21,105 +21,16 @@ use ring::aead::{Aad, LessSafeKey, UnboundKey, AES_128_GCM}; use std::collections::HashMap; use std::sync::Arc; -// use ring::aead::NonceSequence; -// use ring::rand::{SecureRandom, SystemRandom}; use crate::errors::{ParquetError, Result}; -// pub trait BlockEncryptor { -// fn encrypt(&mut self, plaintext: &[u8], aad: &[u8]) -> Result>; -// } - pub trait BlockDecryptor { fn decrypt(&self, length_and_ciphertext: &[u8], aad: &[u8]) -> Result>; } -// const 
RIGHT_TWELVE: u128 = 0x0000_0000_ffff_ffff_ffff_ffff_ffff_ffff; const NONCE_LEN: usize = 12; const TAG_LEN: usize = 16; const SIZE_LEN: usize = 4; -// struct CounterNonce { -// start: u128, -// counter: u128, -// } -// -// impl CounterNonce { -// pub fn new(rng: &SystemRandom) -> Self { -// let mut buf = [0; 16]; -// rng.fill(&mut buf).unwrap(); -// -// // Since this is a random seed value, endianess doesn't matter at all, -// // and we can use whatever is platform-native. -// let start = u128::from_ne_bytes(buf) & RIGHT_TWELVE; -// let counter = start.wrapping_add(1); -// -// Self { start, counter } -// } -// -// /// One accessor for the nonce bytes to avoid potentially flipping endianess -// #[inline] -// pub fn get_bytes(&self) -> [u8; NONCE_LEN] { -// self.counter.to_le_bytes()[0..NONCE_LEN].try_into().unwrap() -// } -// } -// -// impl NonceSequence for CounterNonce { -// fn advance(&mut self) -> Result { -// // If we've wrapped around, we've exhausted this nonce sequence -// if (self.counter & RIGHT_TWELVE) == (self.start & RIGHT_TWELVE) { -// Err(ring::error::Unspecified) -// } else { -// // Otherwise, just advance and return the new value -// let buf: [u8; NONCE_LEN] = self.get_bytes(); -// self.counter = self.counter.wrapping_add(1); -// Ok(ring::aead::Nonce::assume_unique_for_key(buf)) -// } -// } -// } -// -// pub(crate) struct RingGcmBlockEncryptor { -// key: LessSafeKey, -// nonce_sequence: CounterNonce, -// } -// -// impl RingGcmBlockEncryptor { -// // todo TBD: some KMS systems produce data keys, need to be able to pass them to Encryptor. -// // todo TBD: for other KMSs, we will create data keys inside arrow-rs, making sure to use SystemRandom -// /// Create a new `RingGcmBlockEncryptor` with a given key and random nonce. -// /// The nonce will advance appropriately with each block encryption and -// /// return an error if it wraps around. 
-// pub(crate) fn new(key_bytes: &[u8]) -> Self { -// let rng = SystemRandom::new(); -// -// // todo support other key sizes -// let key = UnboundKey::new(&AES_128_GCM, key_bytes.as_ref()).unwrap(); -// let nonce = CounterNonce::new(&rng); -// -// Self { -// key: LessSafeKey::new(key), -// nonce_sequence: nonce, -// } -// } -// } -// -// impl BlockEncryptor for RingGcmBlockEncryptor { -// fn encrypt(&mut self, plaintext: &[u8], aad: &[u8]) -> Result> { -// let nonce = self.nonce_sequence.advance()?; -// let ciphertext_len = plaintext.len() + NONCE_LEN + TAG_LEN; -// // todo TBD: add first 4 bytes with the length, per https://github.com/apache/parquet-format/blob/master/Encryption.md#51-encrypted-module-serialization -// let mut result = Vec::with_capacity(SIZE_LEN + ciphertext_len); -// result.extend_from_slice((ciphertext_len as i32).to_le_bytes().as_ref()); -// result.extend_from_slice(nonce.as_ref()); -// result.extend_from_slice(plaintext); -// -// let tag = self -// .key -// .seal_in_place_separate_tag(nonce, Aad::from(aad), &mut result[SIZE_LEN + NONCE_LEN..])?; -// result.extend_from_slice(tag.as_ref()); -// -// Ok(result) -// } -// } #[derive(Debug, Clone)] pub(crate) struct RingGcmBlockDecryptor { @@ -163,10 +74,6 @@ pub(crate) enum ModuleType { DictionaryPage = 3, DataPageHeader = 4, DictionaryPageHeader = 5, - // ColumnIndex = 6, - // OffsetIndex = 7, - // BloomFilterHeader = 8, - // BloomFilterBitset = 9, } pub fn create_footer_aad(file_aad: &[u8]) -> Result> { From e0507514a7e26966214d14eb00c8ecc39afa3219 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 22 Jan 2025 12:20:05 +0100 Subject: [PATCH 42/97] Fix building with ring on wasm --- parquet/Cargo.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index 3d8d4ed9aab4..a17d85b664a6 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -30,6 +30,8 @@ rust-version = { workspace = true } [target.'cfg(target_arch = "wasm32")'.dependencies] ahash = { 
version = "0.8", default-features = false, features = ["compile-time-rng"] } +# See https://github.com/briansmith/ring/issues/918#issuecomment-2077788925 +ring = { version = "0.17", features = ["wasm32_unknown_unknown_js"] } [target.'cfg(not(target_arch = "wasm32"))'.dependencies] ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } From 62ac361c580ef26df18d9b335b34366b612f9b6e Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 22 Jan 2025 13:00:57 +0100 Subject: [PATCH 43/97] file_decryptor into a seperate module --- parquet/src/arrow/arrow_reader/mod.rs | 15 +- parquet/src/encryption/ciphers.rs | 207 +----------------------- parquet/src/encryption/decryption.rs | 223 ++++++++++++++++++++++++++ parquet/src/encryption/mod.rs | 1 + parquet/src/file/metadata/mod.rs | 5 +- parquet/src/file/metadata/reader.rs | 6 +- parquet/src/file/serialized_reader.rs | 5 +- 7 files changed, 244 insertions(+), 218 deletions(-) create mode 100644 parquet/src/encryption/decryption.rs diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index a98b3f7c3059..790828c3e990 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -32,6 +32,8 @@ use crate::arrow::array_reader::{build_array_reader, ArrayReader}; use crate::arrow::schema::{parquet_to_arrow_schema_and_fields, ParquetField}; use crate::arrow::{parquet_to_arrow_field_levels, FieldLevels, ProjectionMask}; use crate::column::page::{PageIterator, PageReader}; +#[cfg(feature = "encryption")] +use crate::encryption::{ciphers::CryptoContext, decryption::FileDecryptionProperties}; use crate::errors::{ParquetError, Result}; use crate::file::metadata::{ParquetMetaData, ParquetMetaDataReader}; use crate::file::reader::{ChunkReader, SerializedPageReader}; @@ -41,9 +43,6 @@ mod filter; mod selection; pub mod statistics; -#[cfg(feature = "encryption")] -use crate::encryption::ciphers::{CryptoContext, FileDecryptionProperties}; - /// 
Builder for constructing parquet readers into arrow. /// /// Most users should use one of the following specializations: @@ -1047,7 +1046,7 @@ mod tests { FloatType, Int32Type, Int64Type, Int96Type, }; #[cfg(feature = "encryption")] - use crate::encryption::ciphers; + use crate::encryption::decryption::FileDecryptionProperties; use crate::errors::Result; use crate::file::properties::{EnabledStatistics, WriterProperties, WriterVersion}; use crate::file::writer::SerializedFileWriter; @@ -1891,7 +1890,7 @@ mod tests { let column_1_key = "1234567890123450".as_bytes(); let column_2_key = "1234567890123451".as_bytes(); - let decryption_properties = ciphers::FileDecryptionProperties::builder() + let decryption_properties = FileDecryptionProperties::builder() .with_column_key("double_field".as_bytes().to_vec(), column_1_key.to_vec()) .with_column_key("float_field".as_bytes().to_vec(), column_2_key.to_vec()) .build(); @@ -1910,7 +1909,7 @@ mod tests { let column_1_key = "1234567890123450".as_bytes(); let column_2_key = "1234567890123451".as_bytes(); - let decryption_properties = ciphers::FileDecryptionProperties::builder() + let decryption_properties = FileDecryptionProperties::builder() .with_footer_key(footer_key.to_vec()) .with_column_key("double_field".as_bytes().to_vec(), column_1_key.to_vec()) .with_column_key("float_field".as_bytes().to_vec(), column_2_key.to_vec()) @@ -1927,7 +1926,7 @@ mod tests { let file = File::open(path).unwrap(); let key_code: &[u8] = "0123456789012345".as_bytes(); - let decryption_properties = ciphers::FileDecryptionProperties::builder() + let decryption_properties = FileDecryptionProperties::builder() .with_footer_key(key_code.to_vec()) .build(); @@ -1937,7 +1936,7 @@ mod tests { #[cfg(feature = "encryption")] fn verify_encryption_test_file_read( file: File, - decryption_properties: ciphers::FileDecryptionProperties, + decryption_properties: FileDecryptionProperties, ) { let decryption_properties = Some(decryption_properties); diff --git 
a/parquet/src/encryption/ciphers.rs b/parquet/src/encryption/ciphers.rs index 780e476726a6..2e200717c94e 100644 --- a/parquet/src/encryption/ciphers.rs +++ b/parquet/src/encryption/ciphers.rs @@ -18,53 +18,9 @@ //! Encryption implementation specific to Parquet, as described //! in the [spec](https://github.com/apache/parquet-format/blob/master/Encryption.md). -use ring::aead::{Aad, LessSafeKey, UnboundKey, AES_128_GCM}; -use std::collections::HashMap; use std::sync::Arc; use crate::errors::{ParquetError, Result}; - -pub trait BlockDecryptor { - fn decrypt(&self, length_and_ciphertext: &[u8], aad: &[u8]) -> Result>; -} - -const NONCE_LEN: usize = 12; -const TAG_LEN: usize = 16; -const SIZE_LEN: usize = 4; - - -#[derive(Debug, Clone)] -pub(crate) struct RingGcmBlockDecryptor { - key: LessSafeKey, -} - -impl RingGcmBlockDecryptor { - pub(crate) fn new(key_bytes: &[u8]) -> Self { - // todo support other key sizes - let key = UnboundKey::new(&AES_128_GCM, key_bytes).unwrap(); - - Self { - key: LessSafeKey::new(key), - } - } -} - -impl BlockDecryptor for RingGcmBlockDecryptor { - fn decrypt(&self, length_and_ciphertext: &[u8], aad: &[u8]) -> Result> { - let mut result = - Vec::with_capacity(length_and_ciphertext.len() - SIZE_LEN - NONCE_LEN - TAG_LEN); - result.extend_from_slice(&length_and_ciphertext[SIZE_LEN + NONCE_LEN..]); - - let nonce = ring::aead::Nonce::try_assume_unique_for_key( - &length_and_ciphertext[SIZE_LEN..SIZE_LEN + NONCE_LEN], - )?; - - self.key.open_in_place(nonce, Aad::from(aad), &mut result)?; - - // Truncate result to remove the tag - result.resize(result.len() - TAG_LEN, 0u8); - Ok(result) - } -} +use crate::encryption::decryption::FileDecryptor; #[derive(PartialEq)] pub(crate) enum ModuleType { @@ -158,167 +114,6 @@ fn create_module_aad( Ok(aad) } -#[derive(Debug, Clone, PartialEq)] -pub struct FileDecryptionProperties { - footer_key: Option>, - column_keys: Option, Vec>>, - aad_prefix: Option>, -} - -impl FileDecryptionProperties { - pub fn 
builder() -> DecryptionPropertiesBuilder { - DecryptionPropertiesBuilder::with_defaults() - } - pub fn has_footer_key(&self) -> bool { - self.footer_key.is_some() - } - - pub fn has_column_keys(&self) -> bool { - self.column_keys.is_some() - } - - pub fn aad_prefix(&self) -> Option> { - self.aad_prefix.clone() - } -} - -pub struct DecryptionPropertiesBuilder { - footer_key: Option>, - column_keys: Option, Vec>>, - aad_prefix: Option>, -} - -impl DecryptionPropertiesBuilder { - pub fn with_defaults() -> Self { - Self { - footer_key: None, - column_keys: None, - aad_prefix: None, - } - } - - pub fn build(self) -> FileDecryptionProperties { - FileDecryptionProperties { - footer_key: self.footer_key, - column_keys: self.column_keys, - aad_prefix: self.aad_prefix, - } - } - - // todo decr: doc comment - pub fn with_footer_key(mut self, value: Vec) -> Self { - self.footer_key = Some(value); - self - } - - pub fn with_aad_prefix(mut self, value: Vec) -> Self { - self.aad_prefix = Some(value); - self - } - - pub fn with_column_key(mut self, key: Vec, value: Vec) -> Self { - let mut column_keys = self.column_keys.unwrap_or_default(); - column_keys.insert(key, value); - self.column_keys = Some(column_keys); - self - } -} - -#[derive(Debug, Clone)] -pub struct FileDecryptor { - decryption_properties: FileDecryptionProperties, - // todo decr: change to BlockDecryptor - footer_decryptor: Option, - aad_file_unique: Vec, - aad_prefix: Vec, -} - -impl PartialEq for FileDecryptor { - fn eq(&self, other: &Self) -> bool { - self.decryption_properties == other.decryption_properties - } -} - -impl FileDecryptor { - pub(crate) fn new( - decryption_properties: &FileDecryptionProperties, - aad_file_unique: Vec, - aad_prefix: Vec, - ) -> Self { - let footer_decryptor = decryption_properties - .footer_key - .clone() - .map(|footer_key| RingGcmBlockDecryptor::new(footer_key.as_ref())); - - Self { - // todo decr: if no key available yet (not set in properties, will be retrieved from metadata) 
- footer_decryptor, - decryption_properties: decryption_properties.clone(), - aad_file_unique, - aad_prefix, - } - } - - // todo decr: change to BlockDecryptor - pub(crate) fn get_footer_decryptor(self) -> RingGcmBlockDecryptor { - self.footer_decryptor.unwrap() - } - - pub(crate) fn has_column_key(&self, column_name: &[u8]) -> bool { - self.decryption_properties - .column_keys - .clone() - .unwrap() - .contains_key(column_name) - } - - pub(crate) fn get_column_decryptor(&self, column_name: &[u8]) -> FileDecryptor { - if self.decryption_properties.column_keys.is_none() || !self.has_column_key(column_name) { - return self.clone(); - } - let column_keys = &self.decryption_properties.column_keys.clone().unwrap(); - let decryption_properties = if let Some(column_key) = column_keys.get(column_name) { - DecryptionPropertiesBuilder::with_defaults() - .with_footer_key(column_key.clone()) - .with_aad_prefix(self.aad_prefix.clone()) - .build() - } else { - self.decryption_properties.clone() - }; - - FileDecryptor::new( - &decryption_properties, - self.aad_file_unique.clone(), - self.aad_prefix.clone(), - ) - } - - pub(crate) fn decryption_properties(&self) -> &FileDecryptionProperties { - &self.decryption_properties - } - - pub(crate) fn footer_decryptor(&self) -> Option { - self.footer_decryptor.clone() - } - - pub(crate) fn aad_file_unique(&self) -> &Vec { - &self.aad_file_unique - } - - pub(crate) fn aad_prefix(&self) -> &Vec { - &self.aad_prefix - } - - pub(crate) fn has_footer_key(&self) -> bool { - self.decryption_properties.has_footer_key() - } - - pub(crate) fn is_column_encrypted(&self, column_name: &[u8]) -> bool { - // Column is encrypted if either uniform encryption is used or an encryption key is set for the column - self.decryption_properties.column_keys.is_none() || self.has_column_key(column_name) - } -} - #[derive(Debug, Clone)] pub struct CryptoContext { pub(crate) row_group_ordinal: usize, diff --git a/parquet/src/encryption/decryption.rs 
b/parquet/src/encryption/decryption.rs new file mode 100644 index 000000000000..23d7b7eb3295 --- /dev/null +++ b/parquet/src/encryption/decryption.rs @@ -0,0 +1,223 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::collections::HashMap; +use ring::aead::{Aad, LessSafeKey, UnboundKey, AES_128_GCM}; +use arrow_array::Datum; + +const NONCE_LEN: usize = 12; +const TAG_LEN: usize = 16; +const SIZE_LEN: usize = 4; + +pub trait BlockDecryptor { + fn decrypt(&self, length_and_ciphertext: &[u8], aad: &[u8]) -> crate::errors::Result>; +} + +#[derive(Debug, Clone)] +pub(crate) struct RingGcmBlockDecryptor { + key: LessSafeKey, +} + +impl RingGcmBlockDecryptor { + pub(crate) fn new(key_bytes: &[u8]) -> Self { + // todo support other key sizes + let key = UnboundKey::new(&AES_128_GCM, key_bytes).unwrap(); + + Self { + key: LessSafeKey::new(key), + } + } +} + +impl BlockDecryptor for RingGcmBlockDecryptor { + fn decrypt(&self, length_and_ciphertext: &[u8], aad: &[u8]) -> crate::errors::Result> { + let mut result = + Vec::with_capacity(length_and_ciphertext.len() - SIZE_LEN - NONCE_LEN - TAG_LEN); + result.extend_from_slice(&length_and_ciphertext[SIZE_LEN + NONCE_LEN..]); + + let nonce = 
ring::aead::Nonce::try_assume_unique_for_key( + &length_and_ciphertext[SIZE_LEN..SIZE_LEN + NONCE_LEN], + )?; + + self.key.open_in_place(nonce, Aad::from(aad), &mut result)?; + + // Truncate result to remove the tag + result.resize(result.len() - TAG_LEN, 0u8); + Ok(result) + } +} + +#[derive(Debug, Clone, PartialEq)] +pub struct FileDecryptionProperties { + footer_key: Option>, + column_keys: Option, Vec>>, + aad_prefix: Option>, +} + +impl FileDecryptionProperties { + pub fn builder() -> DecryptionPropertiesBuilder { + DecryptionPropertiesBuilder::with_defaults() + } + pub fn has_footer_key(&self) -> bool { + self.footer_key.is_some() + } + + pub fn has_column_keys(&self) -> bool { + self.column_keys.is_some() + } + + pub fn aad_prefix(&self) -> Option> { + self.aad_prefix.clone() + } +} + +pub struct DecryptionPropertiesBuilder { + footer_key: Option>, + column_keys: Option, Vec>>, + aad_prefix: Option>, +} + +impl DecryptionPropertiesBuilder { + pub fn with_defaults() -> Self { + Self { + footer_key: None, + column_keys: None, + aad_prefix: None, + } + } + + pub fn build(self) -> FileDecryptionProperties { + FileDecryptionProperties { + footer_key: self.footer_key, + column_keys: self.column_keys, + aad_prefix: self.aad_prefix, + } + } + + // todo decr: doc comment + pub fn with_footer_key(mut self, value: Vec) -> Self { + self.footer_key = Some(value); + self + } + + pub fn with_aad_prefix(mut self, value: Vec) -> Self { + self.aad_prefix = Some(value); + self + } + + pub fn with_column_key(mut self, key: Vec, value: Vec) -> Self { + let mut column_keys = self.column_keys.unwrap_or_default(); + column_keys.insert(key, value); + self.column_keys = Some(column_keys); + self + } +} + +#[derive(Debug, Clone)] +pub struct FileDecryptor { + decryption_properties: FileDecryptionProperties, + // todo decr: change to BlockDecryptor + footer_decryptor: Option, + aad_file_unique: Vec, + aad_prefix: Vec, +} + +impl PartialEq for FileDecryptor { + fn eq(&self, other: 
&Self) -> bool { + self.decryption_properties == other.decryption_properties + } +} + +impl FileDecryptor { + pub(crate) fn new( + decryption_properties: &FileDecryptionProperties, + aad_file_unique: Vec, + aad_prefix: Vec, + ) -> Self { + let footer_decryptor = decryption_properties + .footer_key + .clone() + .map(|footer_key| RingGcmBlockDecryptor::new(footer_key.as_ref())); + + Self { + // todo decr: if no key available yet (not set in properties, will be retrieved from metadata) + footer_decryptor, + decryption_properties: decryption_properties.clone(), + aad_file_unique, + aad_prefix, + } + } + + // todo decr: change to BlockDecryptor + pub(crate) fn get_footer_decryptor(self) -> RingGcmBlockDecryptor { + self.footer_decryptor.unwrap() + } + + pub(crate) fn has_column_key(&self, column_name: &[u8]) -> bool { + self.decryption_properties + .column_keys + .clone() + .unwrap() + .contains_key(column_name) + } + + pub(crate) fn get_column_decryptor(&self, column_name: &[u8]) -> FileDecryptor { + if self.decryption_properties.column_keys.is_none() || !self.has_column_key(column_name) { + return self.clone(); + } + let column_keys = &self.decryption_properties.column_keys.clone().unwrap(); + let decryption_properties = if let Some(column_key) = column_keys.get(column_name) { + DecryptionPropertiesBuilder::with_defaults() + .with_footer_key(column_key.clone()) + .with_aad_prefix(self.aad_prefix.clone()) + .build() + } else { + self.decryption_properties.clone() + }; + + FileDecryptor::new( + &decryption_properties, + self.aad_file_unique.clone(), + self.aad_prefix.clone(), + ) + } + + pub(crate) fn decryption_properties(&self) -> &FileDecryptionProperties { + &self.decryption_properties + } + + pub(crate) fn footer_decryptor(&self) -> Option { + self.footer_decryptor.clone() + } + + pub(crate) fn aad_file_unique(&self) -> &Vec { + &self.aad_file_unique + } + + pub(crate) fn aad_prefix(&self) -> &Vec { + &self.aad_prefix + } + + pub(crate) fn has_footer_key(&self) -> 
bool { + self.decryption_properties.has_footer_key() + } + + pub(crate) fn is_column_encrypted(&self, column_name: &[u8]) -> bool { + // Column is encrypted if either uniform encryption is used or an encryption key is set for the column + self.decryption_properties.column_keys.is_none() || self.has_column_key(column_name) + } +} diff --git a/parquet/src/encryption/mod.rs b/parquet/src/encryption/mod.rs index e0e7f5d81919..75a6394af766 100644 --- a/parquet/src/encryption/mod.rs +++ b/parquet/src/encryption/mod.rs @@ -19,3 +19,4 @@ //! in the [spec](https://github.com/apache/parquet-format/blob/master/Encryption.md). pub mod ciphers; +pub mod decryption; diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index 9f79a2416cda..6e0eec7be5b6 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -97,7 +97,10 @@ mod writer; use crate::basic::{ColumnOrder, Compression, Encoding, Type}; #[cfg(feature = "encryption")] -use crate::encryption::ciphers::{create_page_aad, BlockDecryptor, FileDecryptor, ModuleType}; +use crate::encryption::{ + ciphers::{create_page_aad, ModuleType}, + decryption::{BlockDecryptor, FileDecryptor}, +}; use crate::errors::{ParquetError, Result}; pub(crate) use crate::file::metadata::memory::HeapSize; use crate::file::page_encoding_stats::{self, PageEncodingStats}; diff --git a/parquet/src/file/metadata/reader.rs b/parquet/src/file/metadata/reader.rs index 0c5c9f2cb4e5..15b1e96a47fb 100644 --- a/parquet/src/file/metadata/reader.rs +++ b/parquet/src/file/metadata/reader.rs @@ -21,9 +21,11 @@ use bytes::Bytes; use crate::basic::ColumnOrder; #[cfg(feature = "encryption")] -use crate::encryption::ciphers::{ - create_footer_aad, BlockDecryptor, FileDecryptionProperties, FileDecryptor, +use crate::encryption::{ + ciphers::create_footer_aad, + decryption::{BlockDecryptor, FileDecryptionProperties, FileDecryptor}, }; + use crate::errors::{ParquetError, Result}; use 
crate::file::metadata::{FileMetaData, ParquetMetaData, RowGroupMetaData}; use crate::file::page_index::index::Index; diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index 58327e730a98..702201688fab 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -23,7 +23,10 @@ use crate::bloom_filter::Sbbf; use crate::column::page::{Page, PageMetadata, PageReader}; use crate::compression::{create_codec, Codec}; #[cfg(feature = "encryption")] -use crate::encryption::ciphers::{create_page_aad, BlockDecryptor, CryptoContext, ModuleType}; +use crate::encryption::{ + ciphers::{create_page_aad, CryptoContext, ModuleType}, + decryption::BlockDecryptor, +}; use crate::errors::{ParquetError, Result}; use crate::file::page_index::offset_index::OffsetIndexMetaData; use crate::file::{ From 8ab9a4333214f7d7fb6c8b84381f0a2fb4b9a196 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 22 Jan 2025 13:47:18 +0100 Subject: [PATCH 44/97] lint --- parquet/src/encryption/ciphers.rs | 4 ++-- parquet/src/encryption/decryption.rs | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/parquet/src/encryption/ciphers.rs b/parquet/src/encryption/ciphers.rs index 2e200717c94e..6dabadd0c373 100644 --- a/parquet/src/encryption/ciphers.rs +++ b/parquet/src/encryption/ciphers.rs @@ -18,9 +18,9 @@ //! Encryption implementation specific to Parquet, as described //! in the [spec](https://github.com/apache/parquet-format/blob/master/Encryption.md). 
-use std::sync::Arc; -use crate::errors::{ParquetError, Result}; use crate::encryption::decryption::FileDecryptor; +use crate::errors::{ParquetError, Result}; +use std::sync::Arc; #[derive(PartialEq)] pub(crate) enum ModuleType { diff --git a/parquet/src/encryption/decryption.rs b/parquet/src/encryption/decryption.rs index 23d7b7eb3295..a5327d3f67dc 100644 --- a/parquet/src/encryption/decryption.rs +++ b/parquet/src/encryption/decryption.rs @@ -15,9 +15,8 @@ // specific language governing permissions and limitations // under the License. -use std::collections::HashMap; use ring::aead::{Aad, LessSafeKey, UnboundKey, AES_128_GCM}; -use arrow_array::Datum; +use std::collections::HashMap; const NONCE_LEN: usize = 12; const TAG_LEN: usize = 16; From febbe83905979a870b2ac8df9ffefda2446fd979 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 22 Jan 2025 16:14:04 +0100 Subject: [PATCH 45/97] FileDecryptionProperties should have at least one key --- parquet/src/arrow/arrow_reader/mod.rs | 9 ++++++--- parquet/src/encryption/decryption.rs | 16 +++++++++++----- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 790828c3e990..2a5ee2e1cf43 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -1893,7 +1893,8 @@ mod tests { let decryption_properties = FileDecryptionProperties::builder() .with_column_key("double_field".as_bytes().to_vec(), column_1_key.to_vec()) .with_column_key("float_field".as_bytes().to_vec(), column_2_key.to_vec()) - .build(); + .build() + .unwrap(); verify_encryption_test_file_read(file, decryption_properties); } @@ -1913,7 +1914,8 @@ mod tests { .with_footer_key(footer_key.to_vec()) .with_column_key("double_field".as_bytes().to_vec(), column_1_key.to_vec()) .with_column_key("float_field".as_bytes().to_vec(), column_2_key.to_vec()) - .build(); + .build() + .unwrap(); verify_encryption_test_file_read(file, 
decryption_properties); } @@ -1928,7 +1930,8 @@ mod tests { let key_code: &[u8] = "0123456789012345".as_bytes(); let decryption_properties = FileDecryptionProperties::builder() .with_footer_key(key_code.to_vec()) - .build(); + .build() + .unwrap(); verify_encryption_test_file_read(file, decryption_properties); } diff --git a/parquet/src/encryption/decryption.rs b/parquet/src/encryption/decryption.rs index a5327d3f67dc..b93fb19385a1 100644 --- a/parquet/src/encryption/decryption.rs +++ b/parquet/src/encryption/decryption.rs @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. +use crate::errors::{ParquetError, Result}; use ring::aead::{Aad, LessSafeKey, UnboundKey, AES_128_GCM}; use std::collections::HashMap; @@ -23,7 +24,7 @@ const TAG_LEN: usize = 16; const SIZE_LEN: usize = 4; pub trait BlockDecryptor { - fn decrypt(&self, length_and_ciphertext: &[u8], aad: &[u8]) -> crate::errors::Result>; + fn decrypt(&self, length_and_ciphertext: &[u8], aad: &[u8]) -> Result>; } #[derive(Debug, Clone)] @@ -43,7 +44,7 @@ impl RingGcmBlockDecryptor { } impl BlockDecryptor for RingGcmBlockDecryptor { - fn decrypt(&self, length_and_ciphertext: &[u8], aad: &[u8]) -> crate::errors::Result> { + fn decrypt(&self, length_and_ciphertext: &[u8], aad: &[u8]) -> Result> { let mut result = Vec::with_capacity(length_and_ciphertext.len() - SIZE_LEN - NONCE_LEN - TAG_LEN); result.extend_from_slice(&length_and_ciphertext[SIZE_LEN + NONCE_LEN..]); @@ -99,12 +100,16 @@ impl DecryptionPropertiesBuilder { } } - pub fn build(self) -> FileDecryptionProperties { - FileDecryptionProperties { + pub fn build(self) -> Result { + if self.footer_key.is_none() && self.column_keys.is_none() { + return Err(ParquetError::General("Footer or at least one column key is required".to_string())) + } + + Ok(FileDecryptionProperties { footer_key: self.footer_key, column_keys: self.column_keys, aad_prefix: self.aad_prefix, - } + }) } // todo decr: doc comment @@ -184,6 
+189,7 @@ impl FileDecryptor { .with_footer_key(column_key.clone()) .with_aad_prefix(self.aad_prefix.clone()) .build() + .unwrap() } else { self.decryption_properties.clone() }; From e5a788e441e0718cd46a2da307cfc1b7edf4b7e5 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 23 Jan 2025 13:49:28 +0100 Subject: [PATCH 46/97] Move cyphertext reading into decryptor --- parquet/src/encryption/decryption.rs | 17 ++++++++++++++++- parquet/src/file/serialized_reader.rs | 8 +------- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/parquet/src/encryption/decryption.rs b/parquet/src/encryption/decryption.rs index b93fb19385a1..49d6aa36491e 100644 --- a/parquet/src/encryption/decryption.rs +++ b/parquet/src/encryption/decryption.rs @@ -18,6 +18,7 @@ use crate::errors::{ParquetError, Result}; use ring::aead::{Aad, LessSafeKey, UnboundKey, AES_128_GCM}; use std::collections::HashMap; +use std::io::Read; const NONCE_LEN: usize = 12; const TAG_LEN: usize = 16; @@ -25,6 +26,8 @@ const SIZE_LEN: usize = 4; pub trait BlockDecryptor { fn decrypt(&self, length_and_ciphertext: &[u8], aad: &[u8]) -> Result>; + + fn read_and_decrypt(&self, input: &mut T, aad: &[u8]) -> Result>; } #[derive(Debug, Clone)] @@ -59,6 +62,16 @@ impl BlockDecryptor for RingGcmBlockDecryptor { result.resize(result.len() - TAG_LEN, 0u8); Ok(result) } + + fn read_and_decrypt(&self, input: &mut T, aad: &[u8]) -> Result> { + let mut len_bytes = [0; 4]; + input.read_exact(&mut len_bytes)?; + let ciphertext_len = u32::from_le_bytes(len_bytes) as usize; + let mut ciphertext = vec![0; 4 + ciphertext_len]; + input.read_exact(&mut ciphertext[4..])?; + + self.decrypt(&ciphertext, aad.as_ref()) + } } #[derive(Debug, Clone, PartialEq)] @@ -102,7 +115,9 @@ impl DecryptionPropertiesBuilder { pub fn build(self) -> Result { if self.footer_key.is_none() && self.column_keys.is_none() { - return Err(ParquetError::General("Footer or at least one column key is required".to_string())) + return Err(ParquetError::General( 
+ "Footer or at least one column key is required".to_string(), + )); } Ok(FileDecryptionProperties { diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index 702201688fab..dc9cc6a315cc 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -377,16 +377,10 @@ pub(crate) fn read_page_header( crypto_context.page_ordinal, )?; - let mut len_bytes = [0; 4]; - input.read_exact(&mut len_bytes)?; - let ciphertext_len = u32::from_le_bytes(len_bytes) as usize; - let mut ciphertext = vec![0; 4 + ciphertext_len]; - input.read_exact(&mut ciphertext[4..])?; - let buf = data_decryptor .footer_decryptor() .unwrap() - .decrypt(&ciphertext, aad.as_ref())?; + .read_and_decrypt(input, aad.as_ref())?; let mut prot = TCompactSliceInputProtocol::new(buf.as_slice()); let page_header = PageHeader::read_from_in_protocol(&mut prot)?; From 423411d4b6a6ca17a12c1c4436eb5bbcf17f99c9 Mon Sep 17 00:00:00 2001 From: Adam Reeve Date: Fri, 24 Jan 2025 10:58:38 +1300 Subject: [PATCH 47/97] More tidy up of footer key handling --- parquet/src/arrow/arrow_reader/mod.rs | 11 +++---- parquet/src/encryption/decryption.rs | 43 +++++++-------------------- parquet/src/file/serialized_reader.rs | 6 ---- 3 files changed, 16 insertions(+), 44 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 2a5ee2e1cf43..06c19092946c 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -1887,10 +1887,13 @@ mod tests { let path = format!("{testdata}/encrypt_columns_plaintext_footer.parquet.encrypted"); let file = File::open(path).unwrap(); + // There is always a footer key even with a plaintext footer, + // but this is used for signing the footer. 
+ let footer_key = "0123456789012345".as_bytes(); // 128bit/16 let column_1_key = "1234567890123450".as_bytes(); let column_2_key = "1234567890123451".as_bytes(); - let decryption_properties = FileDecryptionProperties::builder() + let decryption_properties = FileDecryptionProperties::builder(footer_key.to_vec()) .with_column_key("double_field".as_bytes().to_vec(), column_1_key.to_vec()) .with_column_key("float_field".as_bytes().to_vec(), column_2_key.to_vec()) .build() @@ -1910,8 +1913,7 @@ mod tests { let column_1_key = "1234567890123450".as_bytes(); let column_2_key = "1234567890123451".as_bytes(); - let decryption_properties = FileDecryptionProperties::builder() - .with_footer_key(footer_key.to_vec()) + let decryption_properties = FileDecryptionProperties::builder(footer_key.to_vec()) .with_column_key("double_field".as_bytes().to_vec(), column_1_key.to_vec()) .with_column_key("float_field".as_bytes().to_vec(), column_2_key.to_vec()) .build() @@ -1928,8 +1930,7 @@ mod tests { let file = File::open(path).unwrap(); let key_code: &[u8] = "0123456789012345".as_bytes(); - let decryption_properties = FileDecryptionProperties::builder() - .with_footer_key(key_code.to_vec()) + let decryption_properties = FileDecryptionProperties::builder(key_code.to_vec()) .build() .unwrap(); diff --git a/parquet/src/encryption/decryption.rs b/parquet/src/encryption/decryption.rs index 49d6aa36491e..5c66172a78ba 100644 --- a/parquet/src/encryption/decryption.rs +++ b/parquet/src/encryption/decryption.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. 
-use crate::errors::{ParquetError, Result}; +use crate::errors::Result; use ring::aead::{Aad, LessSafeKey, UnboundKey, AES_128_GCM}; use std::collections::HashMap; use std::io::Read; @@ -76,17 +76,14 @@ impl BlockDecryptor for RingGcmBlockDecryptor { #[derive(Debug, Clone, PartialEq)] pub struct FileDecryptionProperties { - footer_key: Option>, + footer_key: Vec, column_keys: Option, Vec>>, aad_prefix: Option>, } impl FileDecryptionProperties { - pub fn builder() -> DecryptionPropertiesBuilder { - DecryptionPropertiesBuilder::with_defaults() - } - pub fn has_footer_key(&self) -> bool { - self.footer_key.is_some() + pub fn builder(footer_key: Vec) -> DecryptionPropertiesBuilder { + DecryptionPropertiesBuilder::new(footer_key) } pub fn has_column_keys(&self) -> bool { @@ -99,27 +96,21 @@ impl FileDecryptionProperties { } pub struct DecryptionPropertiesBuilder { - footer_key: Option>, + footer_key: Vec, column_keys: Option, Vec>>, aad_prefix: Option>, } impl DecryptionPropertiesBuilder { - pub fn with_defaults() -> Self { + pub fn new(footer_key: Vec) -> DecryptionPropertiesBuilder { Self { - footer_key: None, + footer_key, column_keys: None, aad_prefix: None, } } pub fn build(self) -> Result { - if self.footer_key.is_none() && self.column_keys.is_none() { - return Err(ParquetError::General( - "Footer or at least one column key is required".to_string(), - )); - } - Ok(FileDecryptionProperties { footer_key: self.footer_key, column_keys: self.column_keys, @@ -127,12 +118,6 @@ impl DecryptionPropertiesBuilder { }) } - // todo decr: doc comment - pub fn with_footer_key(mut self, value: Vec) -> Self { - self.footer_key = Some(value); - self - } - pub fn with_aad_prefix(mut self, value: Vec) -> Self { self.aad_prefix = Some(value); self @@ -167,14 +152,11 @@ impl FileDecryptor { aad_file_unique: Vec, aad_prefix: Vec, ) -> Self { - let footer_decryptor = decryption_properties - .footer_key - .clone() - .map(|footer_key| RingGcmBlockDecryptor::new(footer_key.as_ref())); + let 
footer_decryptor = RingGcmBlockDecryptor::new(&decryption_properties.footer_key); Self { // todo decr: if no key available yet (not set in properties, will be retrieved from metadata) - footer_decryptor, + footer_decryptor: Some(footer_decryptor), decryption_properties: decryption_properties.clone(), aad_file_unique, aad_prefix, @@ -200,8 +182,7 @@ impl FileDecryptor { } let column_keys = &self.decryption_properties.column_keys.clone().unwrap(); let decryption_properties = if let Some(column_key) = column_keys.get(column_name) { - DecryptionPropertiesBuilder::with_defaults() - .with_footer_key(column_key.clone()) + DecryptionPropertiesBuilder::new(column_key.clone()) .with_aad_prefix(self.aad_prefix.clone()) .build() .unwrap() @@ -232,10 +213,6 @@ impl FileDecryptor { &self.aad_prefix } - pub(crate) fn has_footer_key(&self) -> bool { - self.decryption_properties.has_footer_key() - } - pub(crate) fn is_column_encrypted(&self, column_name: &[u8]) -> bool { // Column is encrypted if either uniform encryption is used or an encryption key is set for the column self.decryption_properties.column_keys.is_none() || self.has_column_key(column_name) diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index dc9cc6a315cc..b4ec0851c32c 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -352,12 +352,6 @@ pub(crate) fn read_page_header( if let Some(crypto_context) = crypto_context { let decryptor = &crypto_context.data_decryptor(); - if !decryptor.has_footer_key() || decryptor.footer_decryptor().is_none() { - let mut prot = TCompactInputProtocol::new(input); - let page_header = PageHeader::read_from_in_protocol(&mut prot)?; - return Ok(page_header); - }; - let data_decryptor = &crypto_context.data_decryptor(); let aad_file_unique = decryptor.aad_file_unique(); let aad_prefix = decryptor.aad_prefix(); From 187e7de8df51377ac8d61920f8fa87677aaa1787 Mon Sep 17 00:00:00 2001 From: Adam Reeve Date: 
Fri, 24 Jan 2025 12:43:40 +1300 Subject: [PATCH 48/97] Get column decryptors as RingGcmBlockDecryptor --- parquet/src/arrow/arrow_reader/mod.rs | 33 +++++++-------- parquet/src/encryption/ciphers.rs | 32 +++++++++----- parquet/src/encryption/decryption.rs | 60 ++++++++++----------------- parquet/src/file/metadata/mod.rs | 19 +++------ parquet/src/file/metadata/reader.rs | 11 ++--- parquet/src/file/serialized_reader.rs | 56 +++++++------------------ 6 files changed, 87 insertions(+), 124 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 06c19092946c..cec44e72826f 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -706,31 +706,26 @@ impl Iterator for ReaderPageIterator { let reader = self.reader.clone(); #[cfg(feature = "encryption")] - let crypto_context = if self.metadata.file_decryptor().is_some() { + let crypto_context = if let Some(file_decryptor) = self.metadata.file_decryptor() { let column_name = self .metadata .file_metadata() .schema_descr() .column(self.column_idx); - if self - .metadata - .file_decryptor() - .as_ref() - .unwrap() - .is_column_encrypted(column_name.name().as_bytes()) - { - let file_decryptor = self - .metadata - .file_decryptor() - .clone() - .unwrap() - .get_column_decryptor(column_name.name().as_bytes()); - let data_decryptor = Arc::new(file_decryptor.clone()); - let metadata_decryptor = Arc::new(file_decryptor.clone()); - - let crypto_context = - CryptoContext::new(rg_idx, self.column_idx, data_decryptor, metadata_decryptor); + if file_decryptor.is_column_encrypted(column_name.name().as_bytes()) { + let data_decryptor = + file_decryptor.get_column_data_decryptor(column_name.name().as_bytes()); + let metadata_decryptor = + file_decryptor.get_column_metadata_decryptor(column_name.name().as_bytes()); + + let crypto_context = CryptoContext::new( + rg_idx, + self.column_idx, + data_decryptor, + metadata_decryptor, + 
file_decryptor.file_aad().clone(), + ); Some(Arc::new(crypto_context)) } else { None diff --git a/parquet/src/encryption/ciphers.rs b/parquet/src/encryption/ciphers.rs index 6dabadd0c373..ce9e281ed13b 100644 --- a/parquet/src/encryption/ciphers.rs +++ b/parquet/src/encryption/ciphers.rs @@ -18,9 +18,8 @@ //! Encryption implementation specific to Parquet, as described //! in the [spec](https://github.com/apache/parquet-format/blob/master/Encryption.md). -use crate::encryption::decryption::FileDecryptor; +use crate::encryption::decryption::RingGcmBlockDecryptor; use crate::errors::{ParquetError, Result}; -use std::sync::Arc; #[derive(PartialEq)] pub(crate) enum ModuleType { @@ -120,16 +119,21 @@ pub struct CryptoContext { pub(crate) column_ordinal: usize, pub(crate) page_ordinal: Option, pub(crate) dictionary_page: bool, - pub(crate) data_decryptor: Arc, - pub(crate) metadata_decryptor: Arc, + // We have separate data and metadata decryptors because + // in GCM CTR mode, the metadata and data pages use + // different algorithms. 
+ data_decryptor: RingGcmBlockDecryptor, + metadata_decryptor: RingGcmBlockDecryptor, + file_aad: Vec, } impl CryptoContext { pub fn new( row_group_ordinal: usize, column_ordinal: usize, - data_decryptor: Arc, - metadata_decryptor: Arc, + data_decryptor: RingGcmBlockDecryptor, + metadata_decryptor: RingGcmBlockDecryptor, + file_aad: Vec, ) -> Self { Self { row_group_ordinal, @@ -138,6 +142,7 @@ impl CryptoContext { dictionary_page: false, data_decryptor, metadata_decryptor, + file_aad, } } @@ -149,6 +154,7 @@ impl CryptoContext { dictionary_page: false, data_decryptor: self.data_decryptor.clone(), metadata_decryptor: self.metadata_decryptor.clone(), + file_aad: self.file_aad.clone(), } } @@ -160,13 +166,19 @@ impl CryptoContext { dictionary_page: true, data_decryptor: self.data_decryptor.clone(), metadata_decryptor: self.metadata_decryptor.clone(), + file_aad: self.file_aad.clone(), } } - pub fn data_decryptor(&self) -> Arc { - self.data_decryptor.clone() + pub fn data_decryptor(&self) -> &RingGcmBlockDecryptor { + &self.data_decryptor } - pub fn metadata_decryptor(&self) -> Arc { - self.metadata_decryptor.clone() + + pub fn metadata_decryptor(&self) -> &RingGcmBlockDecryptor { + &self.metadata_decryptor + } + + pub fn file_aad(&self) -> &Vec { + &self.file_aad } } diff --git a/parquet/src/encryption/decryption.rs b/parquet/src/encryption/decryption.rs index 5c66172a78ba..a70e0c123606 100644 --- a/parquet/src/encryption/decryption.rs +++ b/parquet/src/encryption/decryption.rs @@ -31,7 +31,8 @@ pub trait BlockDecryptor { } #[derive(Debug, Clone)] -pub(crate) struct RingGcmBlockDecryptor { +// TODO: Make non-pub +pub struct RingGcmBlockDecryptor { key: LessSafeKey, } @@ -136,8 +137,7 @@ pub struct FileDecryptor { decryption_properties: FileDecryptionProperties, // todo decr: change to BlockDecryptor footer_decryptor: Option, - aad_file_unique: Vec, - aad_prefix: Vec, + file_aad: Vec, } impl PartialEq for FileDecryptor { @@ -152,20 +152,20 @@ impl FileDecryptor { 
aad_file_unique: Vec, aad_prefix: Vec, ) -> Self { + let file_aad = [aad_prefix.as_slice(), aad_file_unique.as_slice()].concat(); let footer_decryptor = RingGcmBlockDecryptor::new(&decryption_properties.footer_key); Self { // todo decr: if no key available yet (not set in properties, will be retrieved from metadata) footer_decryptor: Some(footer_decryptor), decryption_properties: decryption_properties.clone(), - aad_file_unique, - aad_prefix, + file_aad, } } // todo decr: change to BlockDecryptor - pub(crate) fn get_footer_decryptor(self) -> RingGcmBlockDecryptor { - self.footer_decryptor.unwrap() + pub(crate) fn get_footer_decryptor(&self) -> RingGcmBlockDecryptor { + self.footer_decryptor.clone().unwrap() } pub(crate) fn has_column_key(&self, column_name: &[u8]) -> bool { @@ -176,41 +176,27 @@ impl FileDecryptor { .contains_key(column_name) } - pub(crate) fn get_column_decryptor(&self, column_name: &[u8]) -> FileDecryptor { - if self.decryption_properties.column_keys.is_none() || !self.has_column_key(column_name) { - return self.clone(); + pub(crate) fn get_column_data_decryptor(&self, column_name: &[u8]) -> RingGcmBlockDecryptor { + match self.decryption_properties.column_keys.as_ref() { + None => self.get_footer_decryptor(), + Some(column_keys) => { + match column_keys.get(column_name) { + None => self.get_footer_decryptor(), + Some(column_key) => { + RingGcmBlockDecryptor::new(column_key) + } + } + } } - let column_keys = &self.decryption_properties.column_keys.clone().unwrap(); - let decryption_properties = if let Some(column_key) = column_keys.get(column_name) { - DecryptionPropertiesBuilder::new(column_key.clone()) - .with_aad_prefix(self.aad_prefix.clone()) - .build() - .unwrap() - } else { - self.decryption_properties.clone() - }; - - FileDecryptor::new( - &decryption_properties, - self.aad_file_unique.clone(), - self.aad_prefix.clone(), - ) - } - - pub(crate) fn decryption_properties(&self) -> &FileDecryptionProperties { - &self.decryption_properties - } 
- - pub(crate) fn footer_decryptor(&self) -> Option { - self.footer_decryptor.clone() } - pub(crate) fn aad_file_unique(&self) -> &Vec { - &self.aad_file_unique + pub(crate) fn get_column_metadata_decryptor(&self, column_name: &[u8]) -> RingGcmBlockDecryptor { + // Once GCM CTR mode is implemented, data and metadata decryptors may be different + self.get_column_data_decryptor(column_name) } - pub(crate) fn aad_prefix(&self) -> &Vec { - &self.aad_prefix + pub(crate) fn file_aad(&self) -> &Vec { + &self.file_aad } pub(crate) fn is_column_encrypted(&self, column_name: &[u8]) -> bool { diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index 6e0eec7be5b6..36d645bbe937 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -656,6 +656,8 @@ impl RowGroupMetaData { .enumerate() { if c.encrypted_column_metadata.is_some() { + // TODO: Allow ignoring encrypted column metadata in plaintext mode when no + // decryptor is set let decryptor = decryptor.unwrap(); let Some(ColumnCryptoMetaData::ENCRYPTIONWITHCOLUMNKEY(crypto_metadata)) = c.crypto_metadata.clone() @@ -663,20 +665,11 @@ impl RowGroupMetaData { todo!() }; let column_name = crypto_metadata.path_in_schema.join("."); - let column_decryptor = decryptor - .get_column_decryptor(column_name.as_bytes()) - .footer_decryptor() - .unwrap(); - - let aad_file_unique = decryptor.aad_file_unique(); - let aad_prefix: Vec = decryptor - .decryption_properties() - .aad_prefix() - .unwrap_or_default(); + let column_decryptor = + decryptor.get_column_metadata_decryptor(column_name.as_bytes()); + let column_aad = create_page_aad( - [aad_prefix.as_slice(), aad_file_unique.as_slice()] - .concat() - .as_slice(), + decryptor.file_aad(), ModuleType::ColumnMetaData, rg.ordinal.unwrap() as usize, i, diff --git a/parquet/src/file/metadata/reader.rs b/parquet/src/file/metadata/reader.rs index 15b1e96a47fb..9dbf1a07a556 100644 --- a/parquet/src/file/metadata/reader.rs +++ 
b/parquet/src/file/metadata/reader.rs @@ -741,16 +741,17 @@ impl ParquetMetaDataReader { // todo decr: get key_metadata let aad_file_unique = aes_gcm_algo.aad_file_unique.unwrap(); - let aad_footer = create_footer_aad(aad_file_unique.as_ref())?; let aad_prefix: Vec = aes_gcm_algo.aad_prefix.unwrap_or_default(); decryptor = Some(FileDecryptor::new( file_decryption_properties.unwrap(), - aad_file_unique.clone(), - aad_prefix.clone(), + aad_file_unique, + aad_prefix, )); let footer_decryptor = decryptor.clone().unwrap().get_footer_decryptor(); + let aad_footer = create_footer_aad(decryptor.as_ref().unwrap().file_aad())?; + decrypted_fmd_buf = footer_decryptor.decrypt(prot.as_slice().as_ref(), aad_footer.as_ref())?; prot = TCompactSliceInputProtocol::new(decrypted_fmd_buf.as_ref()); @@ -774,8 +775,8 @@ impl ParquetMetaDataReader { decryptor = Some(FileDecryptor::new( file_decryption_properties.unwrap(), - aad_file_unique.clone(), - aad_prefix.clone(), + aad_file_unique, + aad_prefix, )); // todo get key_metadata etc. 
Set file decryptor in return value // todo check signature diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index b4ec0851c32c..f1053ff1bf2d 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -350,11 +350,7 @@ pub(crate) fn read_page_header( ) -> Result { #[cfg(feature = "encryption")] if let Some(crypto_context) = crypto_context { - let decryptor = &crypto_context.data_decryptor(); - - let data_decryptor = &crypto_context.data_decryptor(); - let aad_file_unique = decryptor.aad_file_unique(); - let aad_prefix = decryptor.aad_prefix(); + let data_decryptor = crypto_context.data_decryptor(); let module_type = if crypto_context.dictionary_page { ModuleType::DictionaryPageHeader @@ -362,19 +358,14 @@ pub(crate) fn read_page_header( ModuleType::DataPageHeader }; let aad = create_page_aad( - [aad_prefix.as_slice(), aad_file_unique.as_slice()] - .concat() - .as_slice(), + crypto_context.file_aad(), module_type, crypto_context.row_group_ordinal, crypto_context.column_ordinal, crypto_context.page_ordinal, )?; - let buf = data_decryptor - .footer_decryptor() - .unwrap() - .read_and_decrypt(input, aad.as_ref())?; + let buf = data_decryptor.read_and_decrypt(input, aad.as_ref())?; let mut prot = TCompactSliceInputProtocol::new(buf.as_slice()); let page_header = PageHeader::read_from_in_protocol(&mut prot)?; @@ -454,35 +445,20 @@ pub(crate) fn decode_page( let buffer: Bytes = if crypto_context.is_some() { let crypto_context = crypto_context.as_ref().unwrap(); let decryptor = crypto_context.data_decryptor(); - // let footer_decryptor - // let file_decryptor = if decryptor.has_footer_key() { - // decryptor.footer_decryptor() - // } else { - // todo - // // decryptor.get_column_decryptor(column_name) - // // CryptoMetaData::from_thrift(&crypto_context.meta_data) - // // .and_then(|meta| meta.get_page_decryptor(crypto_context.page_ordinal)) - // // .ok_or_else(|| general_err!("Missing footer 
decryptor"))? - // }; - let file_decryptor = decryptor.footer_decryptor(); - if file_decryptor.is_none() { - buffer + let module_type = if crypto_context.dictionary_page { + ModuleType::DictionaryPage } else { - let module_type = if crypto_context.dictionary_page { - ModuleType::DictionaryPage - } else { - ModuleType::DataPage - }; - let aad = create_page_aad( - decryptor.aad_file_unique().as_slice(), - module_type, - crypto_context.row_group_ordinal, - crypto_context.column_ordinal, - crypto_context.page_ordinal, - )?; - let decrypted = file_decryptor.unwrap().decrypt(buffer.as_ref(), &aad)?; - Bytes::from(decrypted) - } + ModuleType::DataPage + }; + let aad = create_page_aad( + crypto_context.file_aad(), + module_type, + crypto_context.row_group_ordinal, + crypto_context.column_ordinal, + crypto_context.page_ordinal, + )?; + let decrypted = decryptor.decrypt(buffer.as_ref(), &aad)?; + Bytes::from(decrypted) } else { buffer }; From 65cebbe608500b38336391a64d40b969be72707b Mon Sep 17 00:00:00 2001 From: Adam Reeve Date: Fri, 24 Jan 2025 13:04:39 +1300 Subject: [PATCH 49/97] Use Arc --- parquet/src/encryption/ciphers.rs | 15 +++--- parquet/src/encryption/decryption.rs | 73 +++++++++++++-------------- parquet/src/file/metadata/mod.rs | 2 +- parquet/src/file/metadata/reader.rs | 2 +- parquet/src/file/serialized_reader.rs | 4 +- 5 files changed, 46 insertions(+), 50 deletions(-) diff --git a/parquet/src/encryption/ciphers.rs b/parquet/src/encryption/ciphers.rs index ce9e281ed13b..cdfea3368909 100644 --- a/parquet/src/encryption/ciphers.rs +++ b/parquet/src/encryption/ciphers.rs @@ -18,8 +18,9 @@ //! Encryption implementation specific to Parquet, as described //! in the [spec](https://github.com/apache/parquet-format/blob/master/Encryption.md). 
-use crate::encryption::decryption::RingGcmBlockDecryptor; +use crate::encryption::decryption::BlockDecryptor; use crate::errors::{ParquetError, Result}; +use std::sync::Arc; #[derive(PartialEq)] pub(crate) enum ModuleType { @@ -122,8 +123,8 @@ pub struct CryptoContext { // We have separate data and metadata decryptors because // in GCM CTR mode, the metadata and data pages use // different algorithms. - data_decryptor: RingGcmBlockDecryptor, - metadata_decryptor: RingGcmBlockDecryptor, + data_decryptor: Arc, + metadata_decryptor: Arc, file_aad: Vec, } @@ -131,8 +132,8 @@ impl CryptoContext { pub fn new( row_group_ordinal: usize, column_ordinal: usize, - data_decryptor: RingGcmBlockDecryptor, - metadata_decryptor: RingGcmBlockDecryptor, + data_decryptor: Arc, + metadata_decryptor: Arc, file_aad: Vec, ) -> Self { Self { @@ -170,11 +171,11 @@ impl CryptoContext { } } - pub fn data_decryptor(&self) -> &RingGcmBlockDecryptor { + pub fn data_decryptor(&self) -> &Arc { &self.data_decryptor } - pub fn metadata_decryptor(&self) -> &RingGcmBlockDecryptor { + pub fn metadata_decryptor(&self) -> &Arc { &self.metadata_decryptor } diff --git a/parquet/src/encryption/decryption.rs b/parquet/src/encryption/decryption.rs index a70e0c123606..45bf13aff7dd 100644 --- a/parquet/src/encryption/decryption.rs +++ b/parquet/src/encryption/decryption.rs @@ -18,21 +18,20 @@ use crate::errors::Result; use ring::aead::{Aad, LessSafeKey, UnboundKey, AES_128_GCM}; use std::collections::HashMap; +use std::fmt::Debug; use std::io::Read; +use std::sync::Arc; const NONCE_LEN: usize = 12; const TAG_LEN: usize = 16; const SIZE_LEN: usize = 4; -pub trait BlockDecryptor { +pub trait BlockDecryptor: Debug + Send + Sync { fn decrypt(&self, length_and_ciphertext: &[u8], aad: &[u8]) -> Result>; - - fn read_and_decrypt(&self, input: &mut T, aad: &[u8]) -> Result>; } #[derive(Debug, Clone)] -// TODO: Make non-pub -pub struct RingGcmBlockDecryptor { +pub(crate) struct RingGcmBlockDecryptor { key: LessSafeKey, 
} @@ -63,16 +62,20 @@ impl BlockDecryptor for RingGcmBlockDecryptor { result.resize(result.len() - TAG_LEN, 0u8); Ok(result) } +} - fn read_and_decrypt(&self, input: &mut T, aad: &[u8]) -> Result> { - let mut len_bytes = [0; 4]; - input.read_exact(&mut len_bytes)?; - let ciphertext_len = u32::from_le_bytes(len_bytes) as usize; - let mut ciphertext = vec![0; 4 + ciphertext_len]; - input.read_exact(&mut ciphertext[4..])?; - - self.decrypt(&ciphertext, aad.as_ref()) - } +pub fn read_and_decrypt( + decryptor: &Arc, + input: &mut T, + aad: &[u8], +) -> Result> { + let mut len_bytes = [0; 4]; + input.read_exact(&mut len_bytes)?; + let ciphertext_len = u32::from_le_bytes(len_bytes) as usize; + let mut ciphertext = vec![0; 4 + ciphertext_len]; + input.read_exact(&mut ciphertext[4..])?; + + decryptor.decrypt(&ciphertext, aad.as_ref()) } #[derive(Debug, Clone, PartialEq)] @@ -132,11 +135,10 @@ impl DecryptionPropertiesBuilder { } } -#[derive(Debug, Clone)] +#[derive(Clone, Debug)] pub struct FileDecryptor { decryption_properties: FileDecryptionProperties, - // todo decr: change to BlockDecryptor - footer_decryptor: Option, + footer_decryptor: Option>, file_aad: Vec, } @@ -157,40 +159,30 @@ impl FileDecryptor { Self { // todo decr: if no key available yet (not set in properties, will be retrieved from metadata) - footer_decryptor: Some(footer_decryptor), + footer_decryptor: Some(Arc::new(footer_decryptor)), decryption_properties: decryption_properties.clone(), file_aad, } } - // todo decr: change to BlockDecryptor - pub(crate) fn get_footer_decryptor(&self) -> RingGcmBlockDecryptor { + pub(crate) fn get_footer_decryptor(&self) -> Arc { self.footer_decryptor.clone().unwrap() } - pub(crate) fn has_column_key(&self, column_name: &[u8]) -> bool { - self.decryption_properties - .column_keys - .clone() - .unwrap() - .contains_key(column_name) - } - - pub(crate) fn get_column_data_decryptor(&self, column_name: &[u8]) -> RingGcmBlockDecryptor { + pub(crate) fn 
get_column_data_decryptor(&self, column_name: &[u8]) -> Arc { match self.decryption_properties.column_keys.as_ref() { None => self.get_footer_decryptor(), - Some(column_keys) => { - match column_keys.get(column_name) { - None => self.get_footer_decryptor(), - Some(column_key) => { - RingGcmBlockDecryptor::new(column_key) - } - } - } + Some(column_keys) => match column_keys.get(column_name) { + None => self.get_footer_decryptor(), + Some(column_key) => Arc::new(RingGcmBlockDecryptor::new(column_key)), + }, } } - pub(crate) fn get_column_metadata_decryptor(&self, column_name: &[u8]) -> RingGcmBlockDecryptor { + pub(crate) fn get_column_metadata_decryptor( + &self, + column_name: &[u8], + ) -> Arc { // Once GCM CTR mode is implemented, data and metadata decryptors may be different self.get_column_data_decryptor(column_name) } @@ -201,6 +193,9 @@ impl FileDecryptor { pub(crate) fn is_column_encrypted(&self, column_name: &[u8]) -> bool { // Column is encrypted if either uniform encryption is used or an encryption key is set for the column - self.decryption_properties.column_keys.is_none() || self.has_column_key(column_name) + match self.decryption_properties.column_keys.as_ref() { + None => true, + Some(keys) => keys.contains_key(column_name), + } } } diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index 36d645bbe937..06862afa5f9b 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -99,7 +99,7 @@ use crate::basic::{ColumnOrder, Compression, Encoding, Type}; #[cfg(feature = "encryption")] use crate::encryption::{ ciphers::{create_page_aad, ModuleType}, - decryption::{BlockDecryptor, FileDecryptor}, + decryption::FileDecryptor, }; use crate::errors::{ParquetError, Result}; pub(crate) use crate::file::metadata::memory::HeapSize; diff --git a/parquet/src/file/metadata/reader.rs b/parquet/src/file/metadata/reader.rs index 9dbf1a07a556..c6308876cc2f 100644 --- a/parquet/src/file/metadata/reader.rs +++ 
b/parquet/src/file/metadata/reader.rs @@ -23,7 +23,7 @@ use crate::basic::ColumnOrder; #[cfg(feature = "encryption")] use crate::encryption::{ ciphers::create_footer_aad, - decryption::{BlockDecryptor, FileDecryptionProperties, FileDecryptor}, + decryption::{FileDecryptionProperties, FileDecryptor}, }; use crate::errors::{ParquetError, Result}; diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index f1053ff1bf2d..e445b69f8f9c 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -25,7 +25,7 @@ use crate::compression::{create_codec, Codec}; #[cfg(feature = "encryption")] use crate::encryption::{ ciphers::{create_page_aad, CryptoContext, ModuleType}, - decryption::BlockDecryptor, + decryption::read_and_decrypt, }; use crate::errors::{ParquetError, Result}; use crate::file::page_index::offset_index::OffsetIndexMetaData; @@ -365,7 +365,7 @@ pub(crate) fn read_page_header( crypto_context.page_ordinal, )?; - let buf = data_decryptor.read_and_decrypt(input, aad.as_ref())?; + let buf = read_and_decrypt(data_decryptor, input, aad.as_ref())?; let mut prot = TCompactSliceInputProtocol::new(buf.as_slice()); let page_header = PageHeader::read_from_in_protocol(&mut prot)?; From 53e554ef4495dfe42122002a87dd05848a51fc67 Mon Sep 17 00:00:00 2001 From: Adam Reeve Date: Fri, 24 Jan 2025 13:20:58 +1300 Subject: [PATCH 50/97] Fix file metadata tests --- parquet/src/file/metadata/mod.rs | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index 06862afa5f9b..7cba58a4ef9e 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -1928,7 +1928,11 @@ mod tests { let parquet_meta = ParquetMetaDataBuilder::new(file_metadata.clone()) .set_row_groups(row_group_meta_with_stats) .build(); - let base_expected_size = 3008; + + #[cfg(not(feature = "encryption"))] + let base_expected_size = 
2312; + #[cfg(feature = "encryption")] + let base_expected_size = 2448; assert_eq!(parquet_meta.memory_size(), base_expected_size); @@ -1955,7 +1959,11 @@ mod tests { ]])) .build(); - let bigger_expected_size = 3512; + #[cfg(not(feature = "encryption"))] + let bigger_expected_size = 2816; + #[cfg(feature = "encryption")] + let bigger_expected_size = 2952; + // more set fields means more memory usage assert!(bigger_expected_size > base_expected_size); assert_eq!(parquet_meta.memory_size(), bigger_expected_size); From 55e55ce9218240e4d7e5a6fb7374291d1a868d46 Mon Sep 17 00:00:00 2001 From: Adam Reeve Date: Fri, 24 Jan 2025 14:46:35 +1300 Subject: [PATCH 51/97] Handle reading plaintext footer files without decryption properties --- parquet/Cargo.toml | 1 - parquet/src/arrow/arrow_reader/mod.rs | 64 ++++++++++++++++++- parquet/src/file/metadata/mod.rs | 27 ++++---- parquet/src/file/metadata/reader.rs | 92 +++++++++++++-------------- 4 files changed, 122 insertions(+), 62 deletions(-) diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index a17d85b664a6..8866f80b2230 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -128,7 +128,6 @@ sysinfo = ["dep:sysinfo"] crc = ["dep:crc32fast"] # Enable SIMD UTF-8 validation simdutf8 = ["dep:simdutf8"] -#encryption = ["aes-gcm", "base64"] # Enable Parquet modular encryption support encryption = ["dep:ring"] diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index cec44e72826f..1a590be3a363 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -1026,7 +1026,6 @@ mod tests { }; use arrow_select::concat::concat_batches; - #[cfg(feature = "encryption")] use crate::arrow::arrow_reader::ArrowReaderMetadata; use crate::arrow::arrow_reader::{ ArrowPredicateFn, ArrowReaderBuilder, ArrowReaderOptions, ParquetRecordBatchReader, @@ -1897,6 +1896,69 @@ mod tests { verify_encryption_test_file_read(file, decryption_properties); } + #[test] + fn 
test_non_uniform_encryption_plaintext_footer_without_decryption() { + let testdata = arrow::util::test_util::parquet_test_data(); + let path = format!("{testdata}/encrypt_columns_plaintext_footer.parquet.encrypted"); + let file = File::open(&path).unwrap(); + + let metadata = ArrowReaderMetadata::load( + &file, + Default::default(), + #[cfg(feature = "encryption")] + None, + ) + .unwrap(); + let file_metadata = metadata.metadata.file_metadata(); + + assert_eq!(file_metadata.num_rows(), 50); + assert_eq!(file_metadata.schema_descr().num_columns(), 8); + assert_eq!( + file_metadata.created_by().unwrap(), + "parquet-cpp-arrow version 19.0.0-SNAPSHOT" + ); + + metadata.metadata.row_groups().iter().for_each(|rg| { + assert_eq!(rg.num_columns(), 8); + assert_eq!(rg.num_rows(), 50); + }); + + // Should be able to read unencrypted columns. Test reading one column. + let builder = ParquetRecordBatchReaderBuilder::try_new(file).unwrap(); + let mask = ProjectionMask::leaves(builder.parquet_schema(), [1]); + let record_reader = builder.with_projection(mask).build().unwrap(); + + let mut row_count = 0; + for batch in record_reader { + let batch = batch.unwrap(); + row_count += batch.num_rows(); + + let time_col = batch + .column(0) + .as_primitive::(); + for (i, x) in time_col.iter().enumerate() { + assert_eq!(x.unwrap(), i as i32); + } + } + + assert_eq!(row_count, file_metadata.num_rows() as usize); + + // Reading an encrypted column should fail + let file = File::open(&path).unwrap(); + let builder = ParquetRecordBatchReaderBuilder::try_new(file).unwrap(); + let mask = ProjectionMask::leaves(builder.parquet_schema(), [4]); + let mut record_reader = builder.with_projection(mask).build().unwrap(); + + match record_reader.next() { + Some(Err(ArrowError::ParquetError(s))) => { + assert!(s.contains("protocol error")); + } + _ => { + panic!("Expected ArrowError::ParquetError"); + } + }; + } + #[test] #[cfg(feature = "encryption")] fn test_non_uniform_encryption() { diff --git 
a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index 7cba58a4ef9e..e6ab1ec966aa 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -655,18 +655,23 @@ impl RowGroupMetaData { .zip(schema_descr.columns()) .enumerate() { - if c.encrypted_column_metadata.is_some() { - // TODO: Allow ignoring encrypted column metadata in plaintext mode when no - // decryptor is set - let decryptor = decryptor.unwrap(); - let Some(ColumnCryptoMetaData::ENCRYPTIONWITHCOLUMNKEY(crypto_metadata)) = - c.crypto_metadata.clone() - else { - todo!() + // Read encrypted metadata if it's present and we have a decryptor. + if let (true, Some(decryptor)) = (c.encrypted_column_metadata.is_some(), decryptor) { + let column_decryptor = match c.crypto_metadata.as_ref() { + None => { + return Err(general_err!( + "No crypto_metadata is set for column {}, which has encrypted metadata", + i + )); + } + Some(ColumnCryptoMetaData::ENCRYPTIONWITHCOLUMNKEY(crypto_metadata)) => { + let column_name = crypto_metadata.path_in_schema.join("."); + decryptor.get_column_metadata_decryptor(column_name.as_bytes()) + } + Some(ColumnCryptoMetaData::ENCRYPTIONWITHFOOTERKEY(_)) => { + decryptor.get_footer_decryptor() + } }; - let column_name = crypto_metadata.path_in_schema.join("."); - let column_decryptor = - decryptor.get_column_metadata_decryptor(column_name.as_bytes()); let column_aad = create_page_aad( decryptor.file_aad(), diff --git a/parquet/src/file/metadata/reader.rs b/parquet/src/file/metadata/reader.rs index c6308876cc2f..021b8fdcff9e 100644 --- a/parquet/src/file/metadata/reader.rs +++ b/parquet/src/file/metadata/reader.rs @@ -719,42 +719,31 @@ impl ParquetMetaDataReader { } #[cfg(feature = "encryption")] - let mut decryptor = None; + let mut file_decryptor = None; #[cfg(feature = "encryption")] let decrypted_fmd_buf; #[cfg(feature = "encryption")] if encrypted_footer { - if file_decryption_properties.is_none() { - return Err(general_err!("Parquet file 
has an encrypted footer but no decryption properties were provided")); - }; - - let t_file_crypto_metadata: TFileCryptoMetaData = - TFileCryptoMetaData::read_from_in_protocol(&mut prot) - .map_err(|e| general_err!("Could not parse crypto metadata: {}", e))?; - let algo = t_file_crypto_metadata.encryption_algorithm; - let aes_gcm_algo = if let EncryptionAlgorithm::AESGCMV1(a) = algo { - a - } else { - unreachable!() - }; // todo decr: add support for GCMCTRV1 - - // todo decr: get key_metadata - let aad_file_unique = aes_gcm_algo.aad_file_unique.unwrap(); - let aad_prefix: Vec = aes_gcm_algo.aad_prefix.unwrap_or_default(); - - decryptor = Some(FileDecryptor::new( - file_decryption_properties.unwrap(), - aad_file_unique, - aad_prefix, - )); - let footer_decryptor = decryptor.clone().unwrap().get_footer_decryptor(); + if let Some(file_decryption_properties) = file_decryption_properties { + let t_file_crypto_metadata: TFileCryptoMetaData = + TFileCryptoMetaData::read_from_in_protocol(&mut prot) + .map_err(|e| general_err!("Could not parse crypto metadata: {}", e))?; + let decryptor = get_file_decryptor( + t_file_crypto_metadata.encryption_algorithm, + file_decryption_properties, + ); + let footer_decryptor = decryptor.get_footer_decryptor(); + let aad_footer = create_footer_aad(decryptor.file_aad())?; - let aad_footer = create_footer_aad(decryptor.as_ref().unwrap().file_aad())?; + decrypted_fmd_buf = + footer_decryptor.decrypt(prot.as_slice().as_ref(), aad_footer.as_ref())?; + prot = TCompactSliceInputProtocol::new(decrypted_fmd_buf.as_ref()); - decrypted_fmd_buf = - footer_decryptor.decrypt(prot.as_slice().as_ref(), aad_footer.as_ref())?; - prot = TCompactSliceInputProtocol::new(decrypted_fmd_buf.as_ref()); + file_decryptor = Some(decryptor); + } else { + return Err(general_err!("Parquet file has an encrypted footer but no decryption properties were provided")); + } } let t_file_metadata: TFileMetaData = TFileMetaData::read_from_in_protocol(&mut prot) @@ -763,33 
+752,21 @@ impl ParquetMetaDataReader { let schema_descr = Arc::new(SchemaDescriptor::new(schema)); #[cfg(feature = "encryption")] - if t_file_metadata.encryption_algorithm.is_some() { - let algo = t_file_metadata.encryption_algorithm; - let aes_gcm_algo = if let Some(EncryptionAlgorithm::AESGCMV1(a)) = algo { - a - } else { - unreachable!() - }; // todo decr: add support for GCMCTRV1 - let aad_file_unique = aes_gcm_algo.aad_file_unique.unwrap(); - let aad_prefix: Vec = aes_gcm_algo.aad_prefix.unwrap_or_default(); - - decryptor = Some(FileDecryptor::new( - file_decryption_properties.unwrap(), - aad_file_unique, - aad_prefix, - )); - // todo get key_metadata etc. Set file decryptor in return value - // todo check signature + if let (Some(algo), Some(file_decryption_properties)) = ( + t_file_metadata.encryption_algorithm, + file_decryption_properties, + ) { + // File has a plaintext footer but encryption algorithm is set + file_decryptor = Some(get_file_decryptor(algo, file_decryption_properties)); } let mut row_groups = Vec::new(); - // TODO: row group filtering for rg in t_file_metadata.row_groups { let r = RowGroupMetaData::from_thrift( schema_descr.clone(), rg, #[cfg(feature = "encryption")] - decryptor.as_ref(), + file_decryptor.as_ref(), )?; row_groups.push(r); } @@ -808,7 +785,7 @@ impl ParquetMetaDataReader { file_metadata, row_groups, #[cfg(feature = "encryption")] - decryptor, + file_decryptor, )) } @@ -844,6 +821,23 @@ impl ParquetMetaDataReader { } } +#[cfg(feature = "encryption")] +fn get_file_decryptor( + encryption_algorithm: EncryptionAlgorithm, + file_decryption_properties: &FileDecryptionProperties, +) -> FileDecryptor { + let aes_gcm_algo = if let EncryptionAlgorithm::AESGCMV1(a) = encryption_algorithm { + a + } else { + todo!("GCMCTRV1 encryption algorithm") + }; + + let aad_file_unique = aes_gcm_algo.aad_file_unique.unwrap(); + let aad_prefix: Vec = aes_gcm_algo.aad_prefix.unwrap_or_default(); + + FileDecryptor::new(file_decryption_properties, 
aad_file_unique, aad_prefix) +} + #[cfg(test)] mod tests { use super::*; From 98cc63eb1c64e328256b8bfce06a4f984d222c26 Mon Sep 17 00:00:00 2001 From: Adam Reeve Date: Fri, 24 Jan 2025 15:57:08 +1300 Subject: [PATCH 52/97] Split up encryption modules further --- parquet/src/arrow/arrow_reader/mod.rs | 2 +- parquet/src/encryption/ciphers.rs | 179 ++++---------------------- parquet/src/encryption/decryption.rs | 109 ++++++++++------ parquet/src/encryption/mod.rs | 1 + parquet/src/encryption/modules.rs | 110 ++++++++++++++++ parquet/src/file/metadata/mod.rs | 2 +- parquet/src/file/metadata/reader.rs | 2 +- parquet/src/file/serialized_reader.rs | 4 +- 8 files changed, 212 insertions(+), 197 deletions(-) create mode 100644 parquet/src/encryption/modules.rs diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 1a590be3a363..eb43cc5f2d45 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -33,7 +33,7 @@ use crate::arrow::schema::{parquet_to_arrow_schema_and_fields, ParquetField}; use crate::arrow::{parquet_to_arrow_field_levels, FieldLevels, ProjectionMask}; use crate::column::page::{PageIterator, PageReader}; #[cfg(feature = "encryption")] -use crate::encryption::{ciphers::CryptoContext, decryption::FileDecryptionProperties}; +use crate::encryption::decryption::{CryptoContext, FileDecryptionProperties}; use crate::errors::{ParquetError, Result}; use crate::file::metadata::{ParquetMetaData, ParquetMetaDataReader}; use crate::file::reader::{ChunkReader, SerializedPageReader}; diff --git a/parquet/src/encryption/ciphers.rs b/parquet/src/encryption/ciphers.rs index cdfea3368909..8385932671f5 100644 --- a/parquet/src/encryption/ciphers.rs +++ b/parquet/src/encryption/ciphers.rs @@ -15,171 +15,48 @@ // specific language governing permissions and limitations // under the License. -//! Encryption implementation specific to Parquet, as described -//! 
in the [spec](https://github.com/apache/parquet-format/blob/master/Encryption.md). +use crate::errors::Result; +use ring::aead::{Aad, LessSafeKey, UnboundKey, AES_128_GCM}; +use std::fmt::Debug; -use crate::encryption::decryption::BlockDecryptor; -use crate::errors::{ParquetError, Result}; -use std::sync::Arc; +const NONCE_LEN: usize = 12; +const TAG_LEN: usize = 16; +const SIZE_LEN: usize = 4; -#[derive(PartialEq)] -pub(crate) enum ModuleType { - Footer = 0, - ColumnMetaData = 1, - DataPage = 2, - DictionaryPage = 3, - DataPageHeader = 4, - DictionaryPageHeader = 5, -} - -pub fn create_footer_aad(file_aad: &[u8]) -> Result> { - create_module_aad(file_aad, ModuleType::Footer, 0, 0, None) -} - -pub(crate) fn create_page_aad( - file_aad: &[u8], - module_type: ModuleType, - row_group_ordinal: usize, - column_ordinal: usize, - page_ordinal: Option, -) -> Result> { - create_module_aad( - file_aad, - module_type, - row_group_ordinal, - column_ordinal, - page_ordinal, - ) -} - -fn create_module_aad( - file_aad: &[u8], - module_type: ModuleType, - row_group_ordinal: usize, - column_ordinal: usize, - page_ordinal: Option, -) -> Result> { - let module_buf = [module_type as u8]; - - if module_buf[0] == (ModuleType::Footer as u8) { - let mut aad = Vec::with_capacity(file_aad.len() + 1); - aad.extend_from_slice(file_aad); - aad.extend_from_slice(module_buf.as_ref()); - return Ok(aad); - } - - if row_group_ordinal > i16::MAX as usize { - return Err(general_err!( - "Encrypted parquet files can't have more than {} row groups: {}", - i16::MAX, - row_group_ordinal - )); - } - if column_ordinal > i16::MAX as usize { - return Err(general_err!( - "Encrypted parquet files can't have more than {} columns: {}", - i16::MAX, - column_ordinal - )); - } - - if module_buf[0] != (ModuleType::DataPageHeader as u8) - && module_buf[0] != (ModuleType::DataPage as u8) - { - let mut aad = Vec::with_capacity(file_aad.len() + 5); - aad.extend_from_slice(file_aad); - 
aad.extend_from_slice(module_buf.as_ref()); - aad.extend_from_slice((row_group_ordinal as i16).to_le_bytes().as_ref()); - aad.extend_from_slice((column_ordinal as i16).to_le_bytes().as_ref()); - return Ok(aad); - } - - let page_ordinal = - page_ordinal.ok_or_else(|| general_err!("Page ordinal must be set for data pages"))?; - - if page_ordinal > i16::MAX as usize { - return Err(general_err!( - "Encrypted parquet files can't have more than {} pages per column chunk: {}", - i16::MAX, - page_ordinal - )); - } - - let mut aad = Vec::with_capacity(file_aad.len() + 7); - aad.extend_from_slice(file_aad); - aad.extend_from_slice(module_buf.as_ref()); - aad.extend_from_slice((row_group_ordinal as i16).to_le_bytes().as_ref()); - aad.extend_from_slice((column_ordinal as i16).to_le_bytes().as_ref()); - aad.extend_from_slice((page_ordinal as i16).to_le_bytes().as_ref()); - Ok(aad) +pub trait BlockDecryptor: Debug + Send + Sync { + fn decrypt(&self, length_and_ciphertext: &[u8], aad: &[u8]) -> Result>; } #[derive(Debug, Clone)] -pub struct CryptoContext { - pub(crate) row_group_ordinal: usize, - pub(crate) column_ordinal: usize, - pub(crate) page_ordinal: Option, - pub(crate) dictionary_page: bool, - // We have separate data and metadata decryptors because - // in GCM CTR mode, the metadata and data pages use - // different algorithms. 
- data_decryptor: Arc, - metadata_decryptor: Arc, - file_aad: Vec, +pub(crate) struct RingGcmBlockDecryptor { + key: LessSafeKey, } -impl CryptoContext { - pub fn new( - row_group_ordinal: usize, - column_ordinal: usize, - data_decryptor: Arc, - metadata_decryptor: Arc, - file_aad: Vec, - ) -> Self { - Self { - row_group_ordinal, - column_ordinal, - page_ordinal: None, - dictionary_page: false, - data_decryptor, - metadata_decryptor, - file_aad, - } - } +impl RingGcmBlockDecryptor { + pub(crate) fn new(key_bytes: &[u8]) -> Self { + // todo support other key sizes + let key = UnboundKey::new(&AES_128_GCM, key_bytes).unwrap(); - pub fn with_page_ordinal(&self, page_ordinal: usize) -> Self { Self { - row_group_ordinal: self.row_group_ordinal, - column_ordinal: self.column_ordinal, - page_ordinal: Some(page_ordinal), - dictionary_page: false, - data_decryptor: self.data_decryptor.clone(), - metadata_decryptor: self.metadata_decryptor.clone(), - file_aad: self.file_aad.clone(), + key: LessSafeKey::new(key), } } +} - pub fn for_dictionary_page(&self) -> Self { - Self { - row_group_ordinal: self.row_group_ordinal, - column_ordinal: self.column_ordinal, - page_ordinal: self.page_ordinal, - dictionary_page: true, - data_decryptor: self.data_decryptor.clone(), - metadata_decryptor: self.metadata_decryptor.clone(), - file_aad: self.file_aad.clone(), - } - } +impl BlockDecryptor for RingGcmBlockDecryptor { + fn decrypt(&self, length_and_ciphertext: &[u8], aad: &[u8]) -> Result> { + let mut result = + Vec::with_capacity(length_and_ciphertext.len() - SIZE_LEN - NONCE_LEN - TAG_LEN); + result.extend_from_slice(&length_and_ciphertext[SIZE_LEN + NONCE_LEN..]); - pub fn data_decryptor(&self) -> &Arc { - &self.data_decryptor - } + let nonce = ring::aead::Nonce::try_assume_unique_for_key( + &length_and_ciphertext[SIZE_LEN..SIZE_LEN + NONCE_LEN], + )?; - pub fn metadata_decryptor(&self) -> &Arc { - &self.metadata_decryptor - } + self.key.open_in_place(nonce, Aad::from(aad), &mut 
result)?; - pub fn file_aad(&self) -> &Vec { - &self.file_aad + // Truncate result to remove the tag + result.resize(result.len() - TAG_LEN, 0u8); + Ok(result) } } diff --git a/parquet/src/encryption/decryption.rs b/parquet/src/encryption/decryption.rs index 45bf13aff7dd..eac78def6dc4 100644 --- a/parquet/src/encryption/decryption.rs +++ b/parquet/src/encryption/decryption.rs @@ -15,67 +15,94 @@ // specific language governing permissions and limitations // under the License. +use crate::encryption::ciphers::{BlockDecryptor, RingGcmBlockDecryptor}; use crate::errors::Result; -use ring::aead::{Aad, LessSafeKey, UnboundKey, AES_128_GCM}; use std::collections::HashMap; -use std::fmt::Debug; use std::io::Read; use std::sync::Arc; -const NONCE_LEN: usize = 12; -const TAG_LEN: usize = 16; -const SIZE_LEN: usize = 4; +pub fn read_and_decrypt( + decryptor: &Arc, + input: &mut T, + aad: &[u8], +) -> Result> { + let mut len_bytes = [0; 4]; + input.read_exact(&mut len_bytes)?; + let ciphertext_len = u32::from_le_bytes(len_bytes) as usize; + let mut ciphertext = vec![0; 4 + ciphertext_len]; + input.read_exact(&mut ciphertext[4..])?; -pub trait BlockDecryptor: Debug + Send + Sync { - fn decrypt(&self, length_and_ciphertext: &[u8], aad: &[u8]) -> Result>; + decryptor.decrypt(&ciphertext, aad.as_ref()) } #[derive(Debug, Clone)] -pub(crate) struct RingGcmBlockDecryptor { - key: LessSafeKey, +pub struct CryptoContext { + pub(crate) row_group_ordinal: usize, + pub(crate) column_ordinal: usize, + pub(crate) page_ordinal: Option, + pub(crate) dictionary_page: bool, + // We have separate data and metadata decryptors because + // in GCM CTR mode, the metadata and data pages use + // different algorithms. 
+ data_decryptor: Arc, + metadata_decryptor: Arc, + file_aad: Vec, } -impl RingGcmBlockDecryptor { - pub(crate) fn new(key_bytes: &[u8]) -> Self { - // todo support other key sizes - let key = UnboundKey::new(&AES_128_GCM, key_bytes).unwrap(); - +impl CryptoContext { + pub fn new( + row_group_ordinal: usize, + column_ordinal: usize, + data_decryptor: Arc, + metadata_decryptor: Arc, + file_aad: Vec, + ) -> Self { Self { - key: LessSafeKey::new(key), + row_group_ordinal, + column_ordinal, + page_ordinal: None, + dictionary_page: false, + data_decryptor, + metadata_decryptor, + file_aad, } } -} - -impl BlockDecryptor for RingGcmBlockDecryptor { - fn decrypt(&self, length_and_ciphertext: &[u8], aad: &[u8]) -> Result> { - let mut result = - Vec::with_capacity(length_and_ciphertext.len() - SIZE_LEN - NONCE_LEN - TAG_LEN); - result.extend_from_slice(&length_and_ciphertext[SIZE_LEN + NONCE_LEN..]); - let nonce = ring::aead::Nonce::try_assume_unique_for_key( - &length_and_ciphertext[SIZE_LEN..SIZE_LEN + NONCE_LEN], - )?; + pub fn with_page_ordinal(&self, page_ordinal: usize) -> Self { + Self { + row_group_ordinal: self.row_group_ordinal, + column_ordinal: self.column_ordinal, + page_ordinal: Some(page_ordinal), + dictionary_page: false, + data_decryptor: self.data_decryptor.clone(), + metadata_decryptor: self.metadata_decryptor.clone(), + file_aad: self.file_aad.clone(), + } + } - self.key.open_in_place(nonce, Aad::from(aad), &mut result)?; + pub fn for_dictionary_page(&self) -> Self { + Self { + row_group_ordinal: self.row_group_ordinal, + column_ordinal: self.column_ordinal, + page_ordinal: self.page_ordinal, + dictionary_page: true, + data_decryptor: self.data_decryptor.clone(), + metadata_decryptor: self.metadata_decryptor.clone(), + file_aad: self.file_aad.clone(), + } + } - // Truncate result to remove the tag - result.resize(result.len() - TAG_LEN, 0u8); - Ok(result) + pub fn data_decryptor(&self) -> &Arc { + &self.data_decryptor } -} -pub fn read_and_decrypt( - 
decryptor: &Arc, - input: &mut T, - aad: &[u8], -) -> Result> { - let mut len_bytes = [0; 4]; - input.read_exact(&mut len_bytes)?; - let ciphertext_len = u32::from_le_bytes(len_bytes) as usize; - let mut ciphertext = vec![0; 4 + ciphertext_len]; - input.read_exact(&mut ciphertext[4..])?; + pub fn metadata_decryptor(&self) -> &Arc { + &self.metadata_decryptor + } - decryptor.decrypt(&ciphertext, aad.as_ref()) + pub fn file_aad(&self) -> &Vec { + &self.file_aad + } } #[derive(Debug, Clone, PartialEq)] diff --git a/parquet/src/encryption/mod.rs b/parquet/src/encryption/mod.rs index 75a6394af766..1e33bf4fbd6d 100644 --- a/parquet/src/encryption/mod.rs +++ b/parquet/src/encryption/mod.rs @@ -20,3 +20,4 @@ pub mod ciphers; pub mod decryption; +pub mod modules; diff --git a/parquet/src/encryption/modules.rs b/parquet/src/encryption/modules.rs new file mode 100644 index 000000000000..5ff060b3abd4 --- /dev/null +++ b/parquet/src/encryption/modules.rs @@ -0,0 +1,110 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::errors::ParquetError; + +#[derive(PartialEq)] +pub(crate) enum ModuleType { + Footer = 0, + ColumnMetaData = 1, + DataPage = 2, + DictionaryPage = 3, + DataPageHeader = 4, + DictionaryPageHeader = 5, +} + +pub fn create_footer_aad(file_aad: &[u8]) -> crate::errors::Result> { + create_module_aad(file_aad, ModuleType::Footer, 0, 0, None) +} + +pub(crate) fn create_page_aad( + file_aad: &[u8], + module_type: ModuleType, + row_group_ordinal: usize, + column_ordinal: usize, + page_ordinal: Option, +) -> crate::errors::Result> { + create_module_aad( + file_aad, + module_type, + row_group_ordinal, + column_ordinal, + page_ordinal, + ) +} + +fn create_module_aad( + file_aad: &[u8], + module_type: ModuleType, + row_group_ordinal: usize, + column_ordinal: usize, + page_ordinal: Option, +) -> crate::errors::Result> { + let module_buf = [module_type as u8]; + + if module_buf[0] == (ModuleType::Footer as u8) { + let mut aad = Vec::with_capacity(file_aad.len() + 1); + aad.extend_from_slice(file_aad); + aad.extend_from_slice(module_buf.as_ref()); + return Ok(aad); + } + + if row_group_ordinal > i16::MAX as usize { + return Err(general_err!( + "Encrypted parquet files can't have more than {} row groups: {}", + i16::MAX, + row_group_ordinal + )); + } + if column_ordinal > i16::MAX as usize { + return Err(general_err!( + "Encrypted parquet files can't have more than {} columns: {}", + i16::MAX, + column_ordinal + )); + } + + if module_buf[0] != (ModuleType::DataPageHeader as u8) + && module_buf[0] != (ModuleType::DataPage as u8) + { + let mut aad = Vec::with_capacity(file_aad.len() + 5); + aad.extend_from_slice(file_aad); + aad.extend_from_slice(module_buf.as_ref()); + aad.extend_from_slice((row_group_ordinal as i16).to_le_bytes().as_ref()); + aad.extend_from_slice((column_ordinal as i16).to_le_bytes().as_ref()); + return Ok(aad); + } + + let page_ordinal = + page_ordinal.ok_or_else(|| general_err!("Page ordinal must be set for data pages"))?; + + if page_ordinal > 
i16::MAX as usize { + return Err(general_err!( + "Encrypted parquet files can't have more than {} pages per column chunk: {}", + i16::MAX, + page_ordinal + )); + } + + let mut aad = Vec::with_capacity(file_aad.len() + 7); + aad.extend_from_slice(file_aad); + aad.extend_from_slice(module_buf.as_ref()); + aad.extend_from_slice((row_group_ordinal as i16).to_le_bytes().as_ref()); + aad.extend_from_slice((column_ordinal as i16).to_le_bytes().as_ref()); + aad.extend_from_slice((page_ordinal as i16).to_le_bytes().as_ref()); + Ok(aad) +} diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index e6ab1ec966aa..630730b476a7 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -98,8 +98,8 @@ mod writer; use crate::basic::{ColumnOrder, Compression, Encoding, Type}; #[cfg(feature = "encryption")] use crate::encryption::{ - ciphers::{create_page_aad, ModuleType}, decryption::FileDecryptor, + modules::{create_page_aad, ModuleType}, }; use crate::errors::{ParquetError, Result}; pub(crate) use crate::file::metadata::memory::HeapSize; diff --git a/parquet/src/file/metadata/reader.rs b/parquet/src/file/metadata/reader.rs index 021b8fdcff9e..6513adca5ae0 100644 --- a/parquet/src/file/metadata/reader.rs +++ b/parquet/src/file/metadata/reader.rs @@ -22,8 +22,8 @@ use bytes::Bytes; use crate::basic::ColumnOrder; #[cfg(feature = "encryption")] use crate::encryption::{ - ciphers::create_footer_aad, decryption::{FileDecryptionProperties, FileDecryptor}, + modules::create_footer_aad, }; use crate::errors::{ParquetError, Result}; diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index e445b69f8f9c..860df913b2c6 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -24,8 +24,8 @@ use crate::column::page::{Page, PageMetadata, PageReader}; use crate::compression::{create_codec, Codec}; #[cfg(feature = "encryption")] use crate::encryption::{ - 
ciphers::{create_page_aad, CryptoContext, ModuleType}, - decryption::read_and_decrypt, + decryption::{read_and_decrypt, CryptoContext}, + modules::{create_page_aad, ModuleType}, }; use crate::errors::{ParquetError, Result}; use crate::file::page_index::offset_index::OffsetIndexMetaData; From 0ff94048880430446fce8556c7657cc450535904 Mon Sep 17 00:00:00 2001 From: Adam Reeve Date: Fri, 24 Jan 2025 16:59:18 +1300 Subject: [PATCH 53/97] Error instead of panic for AES-GCM-CTR --- parquet/src/arrow/arrow_reader/mod.rs | 31 +++++++++++++++++++++++++ parquet/src/file/metadata/reader.rs | 33 ++++++++++++++++----------- 2 files changed, 51 insertions(+), 13 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index eb43cc5f2d45..5ec61aa096aa 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -1994,6 +1994,37 @@ mod tests { verify_encryption_test_file_read(file, decryption_properties); } + #[test] + #[cfg(feature = "encryption")] + fn test_aes_ctr_encryption() { + let testdata = arrow::util::test_util::parquet_test_data(); + let path = format!("{testdata}/encrypt_columns_and_footer_ctr.parquet.encrypted"); + let file = File::open(path).unwrap(); + + let footer_key = "0123456789012345".as_bytes(); + let column_1_key = "1234567890123450".as_bytes(); + let column_2_key = "1234567890123451".as_bytes(); + + let decryption_properties = FileDecryptionProperties::builder(footer_key.to_vec()) + .with_column_key("double_field".as_bytes().to_vec(), column_1_key.to_vec()) + .with_column_key("float_field".as_bytes().to_vec(), column_2_key.to_vec()) + .build() + .unwrap(); + + let decryption_properties = Some(decryption_properties); + let metadata = + ArrowReaderMetadata::load(&file, Default::default(), decryption_properties.as_ref()); + + match metadata { + Err(crate::errors::ParquetError::NYI(s)) => { + assert!(s.contains("AES_GCM_CTR_V1")); + } + _ => { + panic!("Expected 
ParquetError::NYI"); + } + }; + } + #[cfg(feature = "encryption")] fn verify_encryption_test_file_read( file: File, diff --git a/parquet/src/file/metadata/reader.rs b/parquet/src/file/metadata/reader.rs index 6513adca5ae0..3641caa1695d 100644 --- a/parquet/src/file/metadata/reader.rs +++ b/parquet/src/file/metadata/reader.rs @@ -732,7 +732,7 @@ impl ParquetMetaDataReader { let decryptor = get_file_decryptor( t_file_crypto_metadata.encryption_algorithm, file_decryption_properties, - ); + )?; let footer_decryptor = decryptor.get_footer_decryptor(); let aad_footer = create_footer_aad(decryptor.file_aad())?; @@ -757,7 +757,7 @@ impl ParquetMetaDataReader { file_decryption_properties, ) { // File has a plaintext footer but encryption algorithm is set - file_decryptor = Some(get_file_decryptor(algo, file_decryption_properties)); + file_decryptor = Some(get_file_decryptor(algo, file_decryption_properties)?); } let mut row_groups = Vec::new(); @@ -825,17 +825,24 @@ impl ParquetMetaDataReader { fn get_file_decryptor( encryption_algorithm: EncryptionAlgorithm, file_decryption_properties: &FileDecryptionProperties, -) -> FileDecryptor { - let aes_gcm_algo = if let EncryptionAlgorithm::AESGCMV1(a) = encryption_algorithm { - a - } else { - todo!("GCMCTRV1 encryption algorithm") - }; - - let aad_file_unique = aes_gcm_algo.aad_file_unique.unwrap(); - let aad_prefix: Vec = aes_gcm_algo.aad_prefix.unwrap_or_default(); - - FileDecryptor::new(file_decryption_properties, aad_file_unique, aad_prefix) +) -> Result { + match encryption_algorithm { + EncryptionAlgorithm::AESGCMV1(algo) => { + let aad_file_unique = algo + .aad_file_unique + .ok_or_else(|| general_err!("AAD unique file identifier is not set"))?; + let aad_prefix: Vec = algo.aad_prefix.unwrap_or_default(); + + Ok(FileDecryptor::new( + file_decryption_properties, + aad_file_unique, + aad_prefix, + )) + } + EncryptionAlgorithm::AESGCMCTRV1(_) => Err(nyi_err!( + "The AES_GCM_CTR_V1 encryption algorithm is not yet supported" + 
)), + } } #[cfg(test)] From c6d4dca04dd39545e4ac6dc26897518fd4632acc Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 5 Feb 2025 01:51:40 +0100 Subject: [PATCH 54/97] load_async --- parquet/src/arrow/async_reader/mod.rs | 43 +++++++++++++++++++++++---- 1 file changed, 38 insertions(+), 5 deletions(-) diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index 607518fdc2de..a772bbc25327 100644 --- a/parquet/src/arrow/async_reader/mod.rs +++ b/parquet/src/arrow/async_reader/mod.rs @@ -61,6 +61,9 @@ use crate::format::{BloomFilterAlgorithm, BloomFilterCompression, BloomFilterHas mod metadata; pub use metadata::*; +#[cfg(feature = "encryption")] +use crate::encryption::decryption::FileDecryptionProperties; + #[cfg(feature = "object_store")] mod store; @@ -179,17 +182,29 @@ impl ArrowReaderMetadata { pub async fn load_async( input: &mut T, options: ArrowReaderOptions, + #[cfg(feature = "encryption")] file_decryption_properties: Option< + &FileDecryptionProperties, + >, ) -> Result { // TODO: this is all rather awkward. It would be nice if AsyncFileReader::get_metadata // took an argument to fetch the page indexes. let mut metadata = input.get_metadata().await?; + #[cfg(feature = "encryption")] + let use_encryption = file_decryption_properties.is_some(); + + #[cfg(not(feature = "encryption"))] + let use_encryption = false; + if options.page_index && metadata.column_index().is_none() && metadata.offset_index().is_none() + || use_encryption { let m = Arc::try_unwrap(metadata).unwrap_or_else(|e| e.as_ref().clone()); - let mut reader = ParquetMetaDataReader::new_with_metadata(m).with_page_indexes(true); + let mut reader = ParquetMetaDataReader::new_with_metadata(m) + .with_page_indexes(true) + .with_decryption_properties(file_decryption_properties); reader.load_page_index(input).await?; metadata = Arc::new(reader.finish()?) 
} @@ -347,13 +362,31 @@ impl ParquetRecordBatchStreamBuilder { /// # } /// ``` pub async fn new(input: T) -> Result { - Self::new_with_options(input, Default::default()).await + Self::new_with_options( + input, + Default::default(), + #[cfg(feature = "encryption")] + None, + ) + .await } /// Create a new [`ParquetRecordBatchStreamBuilder`] with the provided async source /// and [`ArrowReaderOptions`] - pub async fn new_with_options(mut input: T, options: ArrowReaderOptions) -> Result { - let metadata = ArrowReaderMetadata::load_async(&mut input, options).await?; + pub async fn new_with_options( + mut input: T, + options: ArrowReaderOptions, + #[cfg(feature = "encryption")] file_decryption_properties: Option< + &FileDecryptionProperties, + >, + ) -> Result { + let metadata = ArrowReaderMetadata::load_async( + &mut input, + options, + #[cfg(feature = "encryption")] + file_decryption_properties, + ) + .await?; Ok(Self::new_with_metadata(input, metadata)) } @@ -386,7 +419,7 @@ impl ParquetRecordBatchStreamBuilder { /// // open file with parquet data /// let mut file = tokio::fs::File::from_std(file); /// // load metadata once - /// let meta = ArrowReaderMetadata::load_async(&mut file, Default::default()).await.unwrap(); + /// let meta = ArrowReaderMetadata::load_async(&mut file, Default::default(), #[cfg(feature = "encryption")] None).await.unwrap(); /// // create two readers, a and b, from the same underlying file /// // without reading the metadata again /// let mut a = ParquetRecordBatchStreamBuilder::new_with_metadata( From dc19abc550a946fbcdf8140970f0d52455045954 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 5 Feb 2025 10:24:07 +0100 Subject: [PATCH 55/97] new_with_options --- parquet/src/arrow/async_reader/mod.rs | 181 +++++++++++++++++++++++--- 1 file changed, 161 insertions(+), 20 deletions(-) diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index a772bbc25327..d48a76ac0321 100644 --- 
a/parquet/src/arrow/async_reader/mod.rs +++ b/parquet/src/arrow/async_reader/mod.rs @@ -1106,6 +1106,7 @@ mod tests { use arrow::error::Result as ArrowResult; use arrow_array::builder::{ListBuilder, StringBuilder}; use arrow_array::cast::AsArray; + use arrow_array::types; use arrow_array::types::Int32Type; use arrow_array::{ Array, ArrayRef, Int32Array, Int8Array, RecordBatchReader, Scalar, StringArray, @@ -1117,6 +1118,7 @@ mod tests { use std::collections::HashMap; use std::sync::{Arc, Mutex}; use tempfile::tempfile; + use tokio::fs::File; #[derive(Clone)] struct TestReader { @@ -1278,9 +1280,14 @@ mod tests { }; let options = ArrowReaderOptions::new().with_page_index(true); - let builder = ParquetRecordBatchStreamBuilder::new_with_options(async_reader, options) - .await - .unwrap(); + let builder = ParquetRecordBatchStreamBuilder::new_with_options( + async_reader, + options, + #[cfg(feature = "encryption")] + None, + ) + .await + .unwrap(); // The builder should have page and offset indexes loaded now let metadata_with_index = builder.metadata(); @@ -1392,9 +1399,14 @@ mod tests { }; let options = ArrowReaderOptions::new().with_page_index(true); - let builder = ParquetRecordBatchStreamBuilder::new_with_options(async_reader, options) - .await - .unwrap(); + let builder = ParquetRecordBatchStreamBuilder::new_with_options( + async_reader, + options, + #[cfg(feature = "encryption")] + None, + ) + .await + .unwrap(); let selection = RowSelection::from(vec![ RowSelector::skip(21), // Skip first page @@ -1475,9 +1487,14 @@ mod tests { }; let options = ArrowReaderOptions::new().with_page_index(true); - let builder = ParquetRecordBatchStreamBuilder::new_with_options(async_reader, options) - .await - .unwrap(); + let builder = ParquetRecordBatchStreamBuilder::new_with_options( + async_reader, + options, + #[cfg(feature = "encryption")] + None, + ) + .await + .unwrap(); let col_idx: usize = rand.gen_range(0..13); let mask = ProjectionMask::leaves(builder.parquet_schema(), 
vec![col_idx]); @@ -1546,9 +1563,14 @@ mod tests { }; let options = ArrowReaderOptions::new().with_page_index(true); - let builder = ParquetRecordBatchStreamBuilder::new_with_options(async_reader, options) - .await - .unwrap(); + let builder = ParquetRecordBatchStreamBuilder::new_with_options( + async_reader, + options, + #[cfg(feature = "encryption")] + None, + ) + .await + .unwrap(); let col_idx: usize = rand.gen_range(0..13); let mask = ProjectionMask::leaves(builder.parquet_schema(), vec![col_idx]); @@ -1780,14 +1802,19 @@ mod tests { let mask = ProjectionMask::leaves(&parquet_schema, vec![0, 2]); let options = ArrowReaderOptions::new().with_page_index(true); - let stream = ParquetRecordBatchStreamBuilder::new_with_options(async_reader, options) - .await - .unwrap() - .with_projection(mask.clone()) - .with_batch_size(1024) - .with_row_filter(filter) - .build() - .unwrap(); + let stream = ParquetRecordBatchStreamBuilder::new_with_options( + async_reader, + options, + #[cfg(feature = "encryption")] + None, + ) + .await + .unwrap() + .with_projection(mask.clone()) + .with_batch_size(1024) + .with_row_filter(filter) + .build() + .unwrap(); let batches: Vec = stream.try_collect().await.unwrap(); @@ -2172,6 +2199,8 @@ mod tests { let mut reader = ParquetRecordBatchStreamBuilder::new_with_options( tokio::fs::File::from_std(file.try_clone().unwrap()), ArrowReaderOptions::new().with_page_index(true), + #[cfg(feature = "encryption")] + None, ) .await .unwrap(); @@ -2379,4 +2408,116 @@ mod tests { let result = reader.try_collect::>().await.unwrap(); assert_eq!(result.len(), 1); } + + #[cfg(feature = "encryption")] + async fn verify_encryption_test_file_read( + file: &mut File, + decryption_properties: FileDecryptionProperties, + ) { + let decryption_properties = Some(decryption_properties); + + let metadata = ArrowReaderMetadata::load_async( + file, + Default::default(), + decryption_properties.as_ref(), + ) + .await + .unwrap(); + let arrow_reader_metadata = 
ArrowReaderMetadata::load_async( + file, + Default::default(), + #[cfg(feature = "encryption")] + None, + ) + .await + .unwrap(); + let file_metadata = metadata.metadata.file_metadata(); + + let record_reader = ParquetRecordBatchStreamBuilder::new_with_metadata( + file.try_clone().await.unwrap(), + arrow_reader_metadata.clone(), + ) + .build() + .unwrap(); + let record_batches = record_reader.try_collect::>().await.unwrap(); + + assert_eq!(file_metadata.num_rows(), 50); + assert_eq!(file_metadata.schema_descr().num_columns(), 8); + assert_eq!( + file_metadata.created_by().unwrap(), + "parquet-cpp-arrow version 19.0.0-SNAPSHOT" + ); + + metadata.metadata.row_groups().iter().for_each(|rg| { + assert_eq!(rg.num_columns(), 8); + assert_eq!(rg.num_rows(), 50); + }); + + let mut row_count = 0; + for batch in record_batches { + row_count += batch.num_rows(); + + let bool_col = batch.column(0).as_boolean(); + let time_col = batch + .column(1) + .as_primitive::(); + let list_col = batch.column(2).as_list::(); + let timestamp_col = batch + .column(3) + .as_primitive::(); + let f32_col = batch.column(4).as_primitive::(); + let f64_col = batch.column(5).as_primitive::(); + let binary_col = batch.column(6).as_binary::(); + let fixed_size_binary_col = batch.column(7).as_fixed_size_binary(); + + for (i, x) in bool_col.iter().enumerate() { + assert_eq!(x.unwrap(), i % 2 == 0); + } + for (i, x) in time_col.iter().enumerate() { + assert_eq!(x.unwrap(), i as i32); + } + for (i, list_item) in list_col.iter().enumerate() { + let list_item = list_item.unwrap(); + let list_item = list_item.as_primitive::(); + assert_eq!(list_item.len(), 2); + assert_eq!(list_item.value(0), ((i * 2) * 1000000000000) as i64); + assert_eq!(list_item.value(1), ((i * 2 + 1) * 1000000000000) as i64); + } + for x in timestamp_col.iter() { + assert!(x.is_some()); + } + for (i, x) in f32_col.iter().enumerate() { + assert_eq!(x.unwrap(), i as f32 * 1.1f32); + } + for (i, x) in f64_col.iter().enumerate() { + 
assert_eq!(x.unwrap(), i as f64 * 1.1111111f64); + } + for (i, x) in binary_col.iter().enumerate() { + assert_eq!(x.is_some(), i % 2 == 0); + if let Some(x) = x { + assert_eq!(&x[0..7], b"parquet"); + } + } + for (i, x) in fixed_size_binary_col.iter().enumerate() { + assert_eq!(x.unwrap(), &[i as u8; 10]); + } + } + + assert_eq!(row_count, file_metadata.num_rows() as usize); + } + + #[tokio::test] + #[cfg(feature = "encryption")] + async fn test_uniform_encryption() { + let testdata = arrow::util::test_util::parquet_test_data(); + let path = format!("{testdata}/uniform_encryption.parquet.encrypted"); + let mut file = File::open(&path).await.unwrap(); + + let key_code: &[u8] = "0123456789012345".as_bytes(); + let decryption_properties = FileDecryptionProperties::builder(key_code.to_vec()) + .build() + .unwrap(); + + verify_encryption_test_file_read(&mut file, decryption_properties); + } } From 329a6139d50a10f621476459a87d8526ca844174 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 5 Feb 2025 17:01:34 +0100 Subject: [PATCH 56/97] Add tests --- parquet/src/arrow/async_reader/mod.rs | 158 ++++++++++++++++++++++++-- 1 file changed, 148 insertions(+), 10 deletions(-) diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index d48a76ac0321..5866fe1f1756 100644 --- a/parquet/src/arrow/async_reader/mod.rs +++ b/parquet/src/arrow/async_reader/mod.rs @@ -1112,7 +1112,7 @@ mod tests { Array, ArrayRef, Int32Array, Int8Array, RecordBatchReader, Scalar, StringArray, StructArray, UInt64Array, }; - use arrow_schema::{DataType, Field, Schema}; + use arrow_schema::{ArrowError, DataType, Field, Schema}; use futures::{StreamExt, TryStreamExt}; use rand::{thread_rng, Rng}; use std::collections::HashMap; @@ -2423,14 +2423,9 @@ mod tests { ) .await .unwrap(); - let arrow_reader_metadata = ArrowReaderMetadata::load_async( - file, - Default::default(), - #[cfg(feature = "encryption")] - None, - ) - .await - .unwrap(); + let arrow_reader_metadata = 
ArrowReaderMetadata::load_async(file, Default::default(), None) + .await + .unwrap(); let file_metadata = metadata.metadata.file_metadata(); let record_reader = ParquetRecordBatchStreamBuilder::new_with_metadata( @@ -2506,6 +2501,114 @@ mod tests { assert_eq!(row_count, file_metadata.num_rows() as usize); } + #[tokio::test] + #[cfg(feature = "encryption")] + async fn test_non_uniform_encryption_plaintext_footer() { + let testdata = arrow::util::test_util::parquet_test_data(); + let path = format!("{testdata}/encrypt_columns_plaintext_footer.parquet.encrypted"); + let mut file = File::open(&path).await.unwrap(); + + // There is always a footer key even with a plaintext footer, + // but this is used for signing the footer. + let footer_key = "0123456789012345".as_bytes(); // 128bit/16 + let column_1_key = "1234567890123450".as_bytes(); + let column_2_key = "1234567890123451".as_bytes(); + + let decryption_properties = FileDecryptionProperties::builder(footer_key.to_vec()) + .with_column_key("double_field".as_bytes().to_vec(), column_1_key.to_vec()) + .with_column_key("float_field".as_bytes().to_vec(), column_2_key.to_vec()) + .build() + .unwrap(); + + verify_encryption_test_file_read(&mut file, decryption_properties).await; + } + + #[tokio::test] + async fn test_non_uniform_encryption_plaintext_footer_without_decryption() { + let testdata = arrow::util::test_util::parquet_test_data(); + let path = format!("{testdata}/encrypt_columns_plaintext_footer.parquet.encrypted"); + let mut file = File::open(&path).await.unwrap(); + + let metadata = ArrowReaderMetadata::load_async( + &mut file, + Default::default(), + #[cfg(feature = "encryption")] + None, + ) + .await + .unwrap(); + let file_metadata = metadata.metadata.file_metadata(); + + assert_eq!(file_metadata.num_rows(), 50); + assert_eq!(file_metadata.schema_descr().num_columns(), 8); + assert_eq!( + file_metadata.created_by().unwrap(), + "parquet-cpp-arrow version 19.0.0-SNAPSHOT" + ); + + //todo + + // 
metadata.metadata.row_groups().iter().for_each(|rg| { + // assert_eq!(rg.num_columns(), 8); + // assert_eq!(rg.num_rows(), 50); + // }); + // + // // Should be able to read unencrypted columns. Test reading one column. + // let builder = ParquetRecordBatchReaderBuilder::try_new(file).unwrap(); + // let mask = ProjectionMask::leaves(builder.parquet_schema(), [1]); + // let record_reader = builder.with_projection(mask).build().unwrap(); + // + // let mut row_count = 0; + // for batch in record_reader { + // let batch = batch.unwrap(); + // row_count += batch.num_rows(); + // + // let time_col = batch + // .column(0) + // .as_primitive::(); + // for (i, x) in time_col.iter().enumerate() { + // assert_eq!(x.unwrap(), i as i32); + // } + // } + // + // assert_eq!(row_count, file_metadata.num_rows() as usize); + // + // // Reading an encrypted column should fail + // let file = std::fs::File::open(&path).unwrap(); + // let builder = ParquetRecordBatchReaderBuilder::try_new(file).unwrap(); + // let mask = ProjectionMask::leaves(builder.parquet_schema(), [4]); + // let mut record_reader = builder.with_projection(mask).build().unwrap(); + // + // match record_reader.next() { + // Some(Err(ArrowError::ParquetError(s))) => { + // assert!(s.contains("protocol error")); + // } + // _ => { + // panic!("Expected ArrowError::ParquetError"); + // } + // }; + } + + #[tokio::test] + #[cfg(feature = "encryption")] + async fn test_non_uniform_encryption() { + let testdata = arrow::util::test_util::parquet_test_data(); + let path = format!("{testdata}/encrypt_columns_and_footer.parquet.encrypted"); + let mut file = File::open(&path).await.unwrap(); + + let footer_key = "0123456789012345".as_bytes(); // 128bit/16 + let column_1_key = "1234567890123450".as_bytes(); + let column_2_key = "1234567890123451".as_bytes(); + + let decryption_properties = FileDecryptionProperties::builder(footer_key.to_vec()) + .with_column_key("double_field".as_bytes().to_vec(), column_1_key.to_vec()) + 
.with_column_key("float_field".as_bytes().to_vec(), column_2_key.to_vec()) + .build() + .unwrap(); + + verify_encryption_test_file_read(&mut file, decryption_properties).await; + } + #[tokio::test] #[cfg(feature = "encryption")] async fn test_uniform_encryption() { @@ -2518,6 +2621,41 @@ mod tests { .build() .unwrap(); - verify_encryption_test_file_read(&mut file, decryption_properties); + verify_encryption_test_file_read(&mut file, decryption_properties).await; + } + + #[tokio::test] + #[cfg(feature = "encryption")] + async fn test_aes_ctr_encryption() { + let testdata = arrow::util::test_util::parquet_test_data(); + let path = format!("{testdata}/encrypt_columns_and_footer_ctr.parquet.encrypted"); + let mut file = File::open(&path).await.unwrap(); + + let footer_key = "0123456789012345".as_bytes(); + let column_1_key = "1234567890123450".as_bytes(); + let column_2_key = "1234567890123451".as_bytes(); + + let decryption_properties = FileDecryptionProperties::builder(footer_key.to_vec()) + .with_column_key("double_field".as_bytes().to_vec(), column_1_key.to_vec()) + .with_column_key("float_field".as_bytes().to_vec(), column_2_key.to_vec()) + .build() + .unwrap(); + + let decryption_properties = Some(decryption_properties); + let metadata = ArrowReaderMetadata::load_async( + &mut file, + Default::default(), + decryption_properties.as_ref(), + ) + .await; + + match metadata { + Err(crate::errors::ParquetError::NYI(s)) => { + assert!(s.contains("AES_GCM_CTR_V1")); + } + _ => { + panic!("Expected ParquetError::NYI"); + } + }; } } From 00aa47ad98417c6e90132e0ae9e4e9431628e735 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 6 Feb 2025 00:28:34 +0100 Subject: [PATCH 57/97] get_metadata --- parquet/examples/read_with_rowgroup.rs | 2 +- parquet/src/arrow/async_reader/mod.rs | 67 ++++++++++++++++++------- parquet/src/arrow/async_reader/store.rs | 17 ++++++- 3 files changed, 67 insertions(+), 19 deletions(-) diff --git a/parquet/examples/read_with_rowgroup.rs 
b/parquet/examples/read_with_rowgroup.rs index be274fe73e0b..c087cb17851d 100644 --- a/parquet/examples/read_with_rowgroup.rs +++ b/parquet/examples/read_with_rowgroup.rs @@ -35,7 +35,7 @@ async fn main() -> Result<()> { let mut file = File::open(&path).await.unwrap(); // The metadata could be cached in other places, this example only shows how to read - let metadata = file.get_metadata().await?; + let metadata = file.get_metadata(None).await?; for rg in metadata.row_groups() { let mut rowgroup = InMemoryRowGroup::create(rg.clone(), ProjectionMask::all()); diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index 5866fe1f1756..e0f69b489d4e 100644 --- a/parquet/src/arrow/async_reader/mod.rs +++ b/parquet/src/arrow/async_reader/mod.rs @@ -106,7 +106,12 @@ pub trait AsyncFileReader: Send { /// Provides asynchronous access to the [`ParquetMetaData`] of a parquet file, /// allowing fine-grained control over how metadata is sourced, in particular allowing /// for caching, pre-fetching, catalog metadata, etc... 
- fn get_metadata(&mut self) -> BoxFuture<'_, Result>>; + fn get_metadata<'a>( + &'a mut self, + #[cfg(feature = "encryption")] file_decryption_properties: Option< + &'a FileDecryptionProperties, + >, + ) -> BoxFuture<'a, Result>>; } /// This allows Box to be used as an AsyncFileReader, @@ -119,8 +124,16 @@ impl AsyncFileReader for Box { self.as_mut().get_byte_ranges(ranges) } - fn get_metadata(&mut self) -> BoxFuture<'_, Result>> { - self.as_mut().get_metadata() + fn get_metadata<'a>( + &'a mut self, + #[cfg(feature = "encryption")] file_decryption_properties: Option< + &'a FileDecryptionProperties, + >, + ) -> BoxFuture<'a, Result>> { + self.as_mut().get_metadata( + #[cfg(feature = "encryption")] + file_decryption_properties, + ) } } @@ -141,7 +154,12 @@ impl AsyncFileReader for T { .boxed() } - fn get_metadata(&mut self) -> BoxFuture<'_, Result>> { + fn get_metadata<'a>( + &'a mut self, + #[cfg(feature = "encryption")] file_decryption_properties: Option< + &'a FileDecryptionProperties, + >, + ) -> BoxFuture<'a, Result>> { const FOOTER_SIZE_I64: i64 = FOOTER_SIZE as i64; async move { self.seek(SeekFrom::End(-FOOTER_SIZE_I64)).await?; @@ -157,12 +175,11 @@ impl AsyncFileReader for T { let mut buf = Vec::with_capacity(metadata_len); self.take(metadata_len as _).read_to_end(&mut buf).await?; - // todo: provide file_decryption_properties Ok(Arc::new(ParquetMetaDataReader::decode_metadata( &buf, footer.encrypted_footer(), #[cfg(feature = "encryption")] - None, + file_decryption_properties, )?)) } .boxed() @@ -188,7 +205,12 @@ impl ArrowReaderMetadata { ) -> Result { // TODO: this is all rather awkward. It would be nice if AsyncFileReader::get_metadata // took an argument to fetch the page indexes. 
- let mut metadata = input.get_metadata().await?; + let mut metadata = input + .get_metadata( + #[cfg(feature = "encryption")] + file_decryption_properties, + ) + .await?; #[cfg(feature = "encryption")] let use_encryption = file_decryption_properties.is_some(); @@ -199,12 +221,14 @@ impl ArrowReaderMetadata { if options.page_index && metadata.column_index().is_none() && metadata.offset_index().is_none() - || use_encryption { let m = Arc::try_unwrap(metadata).unwrap_or_else(|e| e.as_ref().clone()); - let mut reader = ParquetMetaDataReader::new_with_metadata(m) - .with_page_indexes(true) - .with_decryption_properties(file_decryption_properties); + let mut reader = ParquetMetaDataReader::new_with_metadata(m).with_page_indexes(true); + + if use_encryption { + reader = reader.with_decryption_properties(file_decryption_properties); + } + reader.load_page_index(input).await?; metadata = Arc::new(reader.finish()?) } @@ -1112,7 +1136,7 @@ mod tests { Array, ArrayRef, Int32Array, Int8Array, RecordBatchReader, Scalar, StringArray, StructArray, UInt64Array, }; - use arrow_schema::{ArrowError, DataType, Field, Schema}; + use arrow_schema::{DataType, Field, Schema}; use futures::{StreamExt, TryStreamExt}; use rand::{thread_rng, Rng}; use std::collections::HashMap; @@ -1133,7 +1157,12 @@ mod tests { futures::future::ready(Ok(self.data.slice(range))).boxed() } - fn get_metadata(&mut self) -> BoxFuture<'_, Result>> { + fn get_metadata<'a>( + &'a mut self, + #[cfg(feature = "encryption")] _file_decryption_properties: Option< + &'a FileDecryptionProperties, + >, + ) -> BoxFuture<'a, Result>> { futures::future::ready(Ok(self.metadata.clone())).boxed() } } @@ -2423,9 +2452,13 @@ mod tests { ) .await .unwrap(); - let arrow_reader_metadata = ArrowReaderMetadata::load_async(file, Default::default(), None) - .await - .unwrap(); + let arrow_reader_metadata = ArrowReaderMetadata::load_async( + file, + Default::default(), + decryption_properties.as_ref(), + ) + .await + .unwrap(); let 
file_metadata = metadata.metadata.file_metadata(); let record_reader = ParquetRecordBatchStreamBuilder::new_with_metadata( @@ -2650,7 +2683,7 @@ mod tests { .await; match metadata { - Err(crate::errors::ParquetError::NYI(s)) => { + Err(ParquetError::NYI(s)) => { assert!(s.contains("AES_GCM_CTR_V1")); } _ => { diff --git a/parquet/src/arrow/async_reader/store.rs b/parquet/src/arrow/async_reader/store.rs index fd0397b5e1fc..bf1765b09ef8 100644 --- a/parquet/src/arrow/async_reader/store.rs +++ b/parquet/src/arrow/async_reader/store.rs @@ -23,6 +23,7 @@ use object_store::{path::Path, ObjectMeta, ObjectStore}; use tokio::runtime::Handle; use crate::arrow::async_reader::AsyncFileReader; +use crate::encryption::decryption::FileDecryptionProperties; use crate::errors::{ParquetError, Result}; use crate::file::metadata::{ParquetMetaData, ParquetMetaDataReader}; @@ -163,15 +164,29 @@ impl AsyncFileReader for ParquetObjectReader { // an `impl MetadataFetch` and calls those methods to get data from it. Due to `Self`'s impl of // `AsyncFileReader`, the calls to `MetadataFetch::fetch` are just delegated to // `Self::get_bytes`. 
- fn get_metadata(&mut self) -> BoxFuture<'_, Result>> { + fn get_metadata<'a>( + &'a mut self, + #[cfg(feature = "encryption")] file_decryption_properties: Option< + &'a FileDecryptionProperties, + >, + ) -> BoxFuture<'a, Result>> { Box::pin(async move { let file_size = self.meta.size; + #[cfg(not(feature = "encryption"))] let metadata = ParquetMetaDataReader::new() .with_column_indexes(self.preload_column_index) .with_offset_indexes(self.preload_offset_index) .with_prefetch_hint(self.metadata_size_hint) .load_and_finish(self, file_size) .await?; + #[cfg(feature = "encryption")] + let metadata = ParquetMetaDataReader::new() + .with_column_indexes(self.preload_column_index) + .with_offset_indexes(self.preload_offset_index) + .with_prefetch_hint(self.metadata_size_hint) + .with_decryption_properties(file_decryption_properties) + .load_and_finish(self, file_size) + .await?; Ok(Arc::new(metadata)) }) } From fb6cdbcf76749bb4bde6e5918f7ba06d9f2f74c2 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Fri, 7 Feb 2025 12:22:14 +0100 Subject: [PATCH 58/97] Add CryptoContext to async_reader --- parquet/src/arrow/async_reader/mod.rs | 38 +++++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index e0f69b489d4e..cdfde0b87a48 100644 --- a/parquet/src/arrow/async_reader/mod.rs +++ b/parquet/src/arrow/async_reader/mod.rs @@ -68,6 +68,7 @@ use crate::encryption::decryption::FileDecryptionProperties; mod store; use crate::arrow::schema::ParquetField; +use crate::encryption::decryption::CryptoContext; #[cfg(feature = "object_store")] pub use store::*; @@ -632,6 +633,8 @@ where row_count: meta.num_rows() as usize, column_chunks: vec![None; meta.columns().len()], offset_index, + #[cfg(feature = "encryption")] + parquet_metadata: Some(self.metadata.clone()), }; if let Some(filter) = self.filter.as_mut() { @@ -917,6 +920,8 @@ struct InMemoryRowGroup<'a> { offset_index: 
Option<&'a [OffsetIndexMetaData]>, column_chunks: Vec>>, row_count: usize, + #[cfg(feature = "encryption")] + parquet_metadata: Option>, } impl InMemoryRowGroup<'_> { @@ -1018,6 +1023,35 @@ impl RowGroups for InMemoryRowGroup<'_> { } fn column_chunks(&self, i: usize) -> Result> { + let Some(parquet_metadata) = &self.parquet_metadata else { + todo!() + }; + + #[cfg(feature = "encryption")] + let crypto_context = if let Some(file_decryptor) = parquet_metadata.file_decryptor() { + let column_name = parquet_metadata.file_metadata().schema_descr().column(i); + + if file_decryptor.is_column_encrypted(column_name.name().as_bytes()) { + let data_decryptor = + file_decryptor.get_column_data_decryptor(column_name.name().as_bytes()); + let metadata_decryptor = + file_decryptor.get_column_metadata_decryptor(column_name.name().as_bytes()); + + let crypto_context = CryptoContext::new( + 0, + i, + data_decryptor, + metadata_decryptor, + file_decryptor.file_aad().clone(), + ); + Some(Arc::new(crypto_context)) + } else { + None + } + } else { + None + }; + match &self.column_chunks[i] { None => Err(ParquetError::General(format!( "Invalid column index {i}, column was not fetched" @@ -1028,14 +1062,14 @@ impl RowGroups for InMemoryRowGroup<'_> { // filter out empty offset indexes (old versions specified Some(vec![]) when no present) .filter(|index| !index.is_empty()) .map(|index| index[i].page_locations.clone()); - // todo: provide crypto_context + let page_reader: Box = Box::new(SerializedPageReader::new( data.clone(), self.metadata.column(i), self.row_count, page_locations, #[cfg(feature = "encryption")] - None, + crypto_context, )?); Ok(Box::new(ColumnChunkIterator { From aa444083b3d500455f9f36edb233c3901bd897d8 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Fri, 7 Feb 2025 14:38:32 +0100 Subject: [PATCH 59/97] Add row_group_ordinal to InMemoryRowGroup --- parquet/examples/read_with_rowgroup.rs | 1 - parquet/src/arrow/arrow_reader/mod.rs | 2 +- 
parquet/src/arrow/async_reader/mod.rs | 106 ++++++++++++++----------- 3 files changed, 60 insertions(+), 49 deletions(-) diff --git a/parquet/examples/read_with_rowgroup.rs b/parquet/examples/read_with_rowgroup.rs index c087cb17851d..b458907b6187 100644 --- a/parquet/examples/read_with_rowgroup.rs +++ b/parquet/examples/read_with_rowgroup.rs @@ -115,7 +115,6 @@ impl RowGroups for InMemoryRowGroup { None => Err(ParquetError::General(format!( "Invalid column index {i}, column was not fetched" ))), - // todo: provide crypto_context Some(data) => { let page_reader: Box = Box::new(SerializedPageReader::new( data.clone(), diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 5ec61aa096aa..69fbbe5b12b5 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -698,7 +698,7 @@ impl Iterator for ReaderPageIterator { let meta = rg.column(self.column_idx); let offset_index = self.metadata.offset_index(); // `offset_index` may not exist and `i[rg_idx]` will be empty. - // To avoid `i[rg_idx][self.oolumn_idx`] panic, we need to filter out empty `i[rg_idx]`. + // To avoid `i[rg_idx][self.column_idx`] panic, we need to filter out empty `i[rg_idx]`. 
let page_locations = offset_index .filter(|i| !i[rg_idx].is_empty()) .map(|i| i[rg_idx][self.column_idx].page_locations.clone()); diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index cdfde0b87a48..b4fdc42b09d5 100644 --- a/parquet/src/arrow/async_reader/mod.rs +++ b/parquet/src/arrow/async_reader/mod.rs @@ -634,6 +634,8 @@ where column_chunks: vec![None; meta.columns().len()], offset_index, #[cfg(feature = "encryption")] + row_group_ordinal: row_group_idx, + #[cfg(feature = "encryption")] parquet_metadata: Some(self.metadata.clone()), }; @@ -921,6 +923,8 @@ struct InMemoryRowGroup<'a> { column_chunks: Vec>>, row_count: usize, #[cfg(feature = "encryption")] + row_group_ordinal: usize, + #[cfg(feature = "encryption")] parquet_metadata: Option>, } @@ -1023,13 +1027,21 @@ impl RowGroups for InMemoryRowGroup<'_> { } fn column_chunks(&self, i: usize) -> Result> { - let Some(parquet_metadata) = &self.parquet_metadata else { - todo!() - }; - #[cfg(feature = "encryption")] - let crypto_context = if let Some(file_decryptor) = parquet_metadata.file_decryptor() { - let column_name = parquet_metadata.file_metadata().schema_descr().column(i); + let crypto_context = if let Some(file_decryptor) = &self + .parquet_metadata + .clone() + .unwrap() + .file_decryptor() + .clone() + { + let column_name = &self + .parquet_metadata + .clone() + .unwrap() + .file_metadata() + .schema_descr() + .column(i); if file_decryptor.is_column_encrypted(column_name.name().as_bytes()) { let data_decryptor = @@ -1037,8 +1049,9 @@ impl RowGroups for InMemoryRowGroup<'_> { let metadata_decryptor = file_decryptor.get_column_metadata_decryptor(column_name.name().as_bytes()); + // todo: Do we need row_group_ordinal here? 
let crypto_context = CryptoContext::new( - 0, + self.row_group_ordinal, i, data_decryptor, metadata_decryptor, @@ -2613,47 +2626,46 @@ mod tests { "parquet-cpp-arrow version 19.0.0-SNAPSHOT" ); - //todo + metadata.metadata.row_groups().iter().for_each(|rg| { + assert_eq!(rg.num_columns(), 8); + assert_eq!(rg.num_rows(), 50); + }); - // metadata.metadata.row_groups().iter().for_each(|rg| { - // assert_eq!(rg.num_columns(), 8); - // assert_eq!(rg.num_rows(), 50); - // }); - // - // // Should be able to read unencrypted columns. Test reading one column. - // let builder = ParquetRecordBatchReaderBuilder::try_new(file).unwrap(); - // let mask = ProjectionMask::leaves(builder.parquet_schema(), [1]); - // let record_reader = builder.with_projection(mask).build().unwrap(); - // - // let mut row_count = 0; - // for batch in record_reader { - // let batch = batch.unwrap(); - // row_count += batch.num_rows(); - // - // let time_col = batch - // .column(0) - // .as_primitive::(); - // for (i, x) in time_col.iter().enumerate() { - // assert_eq!(x.unwrap(), i as i32); - // } - // } - // - // assert_eq!(row_count, file_metadata.num_rows() as usize); - // - // // Reading an encrypted column should fail - // let file = std::fs::File::open(&path).unwrap(); - // let builder = ParquetRecordBatchReaderBuilder::try_new(file).unwrap(); - // let mask = ProjectionMask::leaves(builder.parquet_schema(), [4]); - // let mut record_reader = builder.with_projection(mask).build().unwrap(); - // - // match record_reader.next() { - // Some(Err(ArrowError::ParquetError(s))) => { - // assert!(s.contains("protocol error")); - // } - // _ => { - // panic!("Expected ArrowError::ParquetError"); - // } - // }; + // Should be able to read unencrypted columns. Test reading one column. 
+ let builder = ParquetRecordBatchStreamBuilder::new(file).await.unwrap(); + let mask = ProjectionMask::leaves(builder.parquet_schema(), [1]); + let record_reader = builder.with_projection(mask).build().unwrap(); + let record_batches = record_reader.try_collect::>().await.unwrap(); + + let mut row_count = 0; + for batch in record_batches { + let batch = batch; + row_count += batch.num_rows(); + + let time_col = batch + .column(0) + .as_primitive::(); + for (i, x) in time_col.iter().enumerate() { + assert_eq!(x.unwrap(), i as i32); + } + } + + assert_eq!(row_count, file_metadata.num_rows() as usize); + + // Reading an encrypted column should fail + let file = File::open(&path).await.unwrap(); + let builder = ParquetRecordBatchStreamBuilder::new(file).await.unwrap(); + let mask = ProjectionMask::leaves(builder.parquet_schema(), [4]); + let mut record_reader = builder.with_projection(mask).build().unwrap(); + + match record_reader.next().await { + Some(Err(ParquetError::ArrowError(s))) => { + assert!(s.contains("protocol error")); + } + _ => { + panic!("Expected ArrowError::ParquetError"); + } + }; } #[tokio::test] From 497abb3a914d1e66e9e9ee0cdf10caa73ee4830a Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Fri, 7 Feb 2025 15:47:30 +0100 Subject: [PATCH 60/97] Adjust docstrings --- parquet/src/arrow/arrow_reader/mod.rs | 4 +++- parquet/src/arrow/async_reader/mod.rs | 10 +++++----- parquet/src/file/serialized_reader.rs | 6 +++--- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 69fbbe5b12b5..c931de118965 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -380,6 +380,9 @@ impl ArrowReaderMetadata { /// If `options` has [`ArrowReaderOptions::with_page_index`] true, but /// `Self::metadata` is missing the page index, this function will attempt /// to load the page index by making an object store request. 
+ /// + /// If encryption is enabled and the file is encrypted, the + /// `file_decryption_properties` must be provided. pub fn load( reader: &T, options: ArrowReaderOptions, @@ -855,7 +858,6 @@ impl ParquetRecordBatchReader { /// Create a new [`ParquetRecordBatchReader`] from the provided chunk reader and [`FileDecryptionProperties`] /// /// Note: this is needed when the parquet file is encrypted - // todo: add options or put file_decryption_properties into options #[cfg(feature = "encryption")] pub fn try_new_with_decryption( reader: T, diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index b4fdc42b09d5..cbb3618121f5 100644 --- a/parquet/src/arrow/async_reader/mod.rs +++ b/parquet/src/arrow/async_reader/mod.rs @@ -107,6 +107,7 @@ pub trait AsyncFileReader: Send { /// Provides asynchronous access to the [`ParquetMetaData`] of a parquet file, /// allowing fine-grained control over how metadata is sourced, in particular allowing /// for caching, pre-fetching, catalog metadata, etc... + /// If data is encrypted, the [`FileDecryptionProperties`] should be provided. fn get_metadata<'a>( &'a mut self, #[cfg(feature = "encryption")] file_decryption_properties: Option< @@ -248,7 +249,8 @@ pub struct AsyncReader(T); /// /// This builder handles reading the parquet file metadata, allowing consumers /// to use this information to select what specific columns, row groups, etc... -/// they wish to be read by the resulting stream +/// they wish to be read by the resulting stream. If footer or columns are encrypted +/// [`FileDecryptionProperties`] should be provided. 
/// /// See examples on [`ParquetRecordBatchStreamBuilder::new`] /// @@ -396,8 +398,8 @@ impl ParquetRecordBatchStreamBuilder { .await } - /// Create a new [`ParquetRecordBatchStreamBuilder`] with the provided async source - /// and [`ArrowReaderOptions`] + /// Create a new [`ParquetRecordBatchStreamBuilder`] with the provided async source, + /// [`ArrowReaderOptions`] and [`FileDecryptionProperties`] if the data is encrypted. pub async fn new_with_options( mut input: T, options: ArrowReaderOptions, @@ -1049,7 +1051,6 @@ impl RowGroups for InMemoryRowGroup<'_> { let metadata_decryptor = file_decryptor.get_column_metadata_decryptor(column_name.name().as_bytes()); - // todo: Do we need row_group_ordinal here? let crypto_context = CryptoContext::new( self.row_group_ordinal, i, @@ -1075,7 +1076,6 @@ impl RowGroups for InMemoryRowGroup<'_> { // filter out empty offset indexes (old versions specified Some(vec![]) when no present) .filter(|index| !index.is_empty()) .map(|index| index[i].page_locations.clone()); - let page_reader: Box = Box::new(SerializedPageReader::new( data.clone(), self.metadata.column(i), diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index 860df913b2c6..c64df8434fb5 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -377,7 +377,8 @@ pub(crate) fn read_page_header( Ok(page_header) } -/// Reads a [`PageHeader`] from the provided [`Read`] returning the number of bytes read +/// Reads a [`PageHeader`] from the provided [`Read`] returning the number of bytes read. +/// If the page header is encrypted [`CryptoContext`] must be provided. 
fn read_page_header_len( input: &mut T, #[cfg(feature = "encryption")] crypto_context: Option>, @@ -468,7 +469,6 @@ pub(crate) fn decode_page( let buffer = match decompressor { Some(decompressor) if can_decompress => { let uncompressed_size = page_header.uncompressed_page_size as usize; - let mut decompressed = Vec::with_capacity(uncompressed_size); let compressed = &buffer.as_ref()[offset..]; decompressed.extend_from_slice(&buffer.as_ref()[..offset]); @@ -583,7 +583,7 @@ pub struct SerializedPageReader { state: SerializedPageReaderState, - /// Crypto context + /// Crypto context carrying objects required for decryption #[cfg(feature = "encryption")] crypto_context: Option>, } From 16e9efe2122dd6576b5678c9fcd94ea04068889d Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Mon, 10 Feb 2025 11:24:56 +0100 Subject: [PATCH 61/97] Apply suggestions from code review Co-authored-by: Adam Reeve --- parquet/src/arrow/async_reader/mod.rs | 5 ++--- parquet/src/file/serialized_reader.rs | 3 +-- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index cbb3618121f5..f8bd1dd8dc6d 100644 --- a/parquet/src/arrow/async_reader/mod.rs +++ b/parquet/src/arrow/async_reader/mod.rs @@ -227,9 +227,8 @@ impl ArrowReaderMetadata { let m = Arc::try_unwrap(metadata).unwrap_or_else(|e| e.as_ref().clone()); let mut reader = ParquetMetaDataReader::new_with_metadata(m).with_page_indexes(true); - if use_encryption { - reader = reader.with_decryption_properties(file_decryption_properties); - } + #[cfg(feature = "encryption")] + reader = reader.with_decryption_properties(file_decryption_properties); reader.load_page_index(input).await?; metadata = Arc::new(reader.finish()?) 
diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index c64df8434fb5..1407e0da61f6 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -443,8 +443,7 @@ pub(crate) fn decode_page( } #[cfg(feature = "encryption")] - let buffer: Bytes = if crypto_context.is_some() { - let crypto_context = crypto_context.as_ref().unwrap(); + let buffer: Bytes = if let Some(crypto_context) = crypto_context { let decryptor = crypto_context.data_decryptor(); let module_type = if crypto_context.dictionary_page { ModuleType::DictionaryPage From 95a3097135ab2cbaad1a04db0f188103583498c8 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Mon, 10 Feb 2025 11:35:19 +0100 Subject: [PATCH 62/97] Review feedback --- parquet/Cargo.toml | 5 +++-- parquet/src/arrow/async_reader/mod.rs | 12 ++---------- parquet/src/encryption/decryption.rs | 12 ++---------- 3 files changed, 7 insertions(+), 22 deletions(-) diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index 8866f80b2230..671675ae0c02 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -31,7 +31,8 @@ rust-version = { workspace = true } [target.'cfg(target_arch = "wasm32")'.dependencies] ahash = { version = "0.8", default-features = false, features = ["compile-time-rng"] } # See https://github.com/briansmith/ring/issues/918#issuecomment-2077788925 -ring = { version = "0.17", features = ["wasm32_unknown_unknown_js"] } +[target.'cfg(feature = "encryption")'.dependencies] +ring = { version = "0.17", default-features = false, features = ["wasm32_unknown_unknown_js", "std"] } [target.'cfg(not(target_arch = "wasm32"))'.dependencies] ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } @@ -101,7 +102,7 @@ zstd-sys = { version = ">=2.0.0, <2.0.14", default-features = false } all-features = true [features] -default = ["arrow", "snap", "brotli", "flate2", "lz4", "zstd", "base64", "simdutf8", "encryption"] +default = ["arrow", "snap", 
"brotli", "flate2", "lz4", "zstd", "base64", "simdutf8"] # Enable lz4 lz4 = ["lz4_flex"] # Enable arrow reader/writer APIs diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index f8bd1dd8dc6d..429372d49239 100644 --- a/parquet/src/arrow/async_reader/mod.rs +++ b/parquet/src/arrow/async_reader/mod.rs @@ -214,12 +214,6 @@ impl ArrowReaderMetadata { ) .await?; - #[cfg(feature = "encryption")] - let use_encryption = file_decryption_properties.is_some(); - - #[cfg(not(feature = "encryption"))] - let use_encryption = false; - if options.page_index && metadata.column_index().is_none() && metadata.offset_index().is_none() @@ -637,7 +631,7 @@ where #[cfg(feature = "encryption")] row_group_ordinal: row_group_idx, #[cfg(feature = "encryption")] - parquet_metadata: Some(self.metadata.clone()), + parquet_metadata: self.metadata.clone(), }; if let Some(filter) = self.filter.as_mut() { @@ -926,7 +920,7 @@ struct InMemoryRowGroup<'a> { #[cfg(feature = "encryption")] row_group_ordinal: usize, #[cfg(feature = "encryption")] - parquet_metadata: Option>, + parquet_metadata: Arc, } impl InMemoryRowGroup<'_> { @@ -1032,14 +1026,12 @@ impl RowGroups for InMemoryRowGroup<'_> { let crypto_context = if let Some(file_decryptor) = &self .parquet_metadata .clone() - .unwrap() .file_decryptor() .clone() { let column_name = &self .parquet_metadata .clone() - .unwrap() .file_metadata() .schema_descr() .column(i); diff --git a/parquet/src/encryption/decryption.rs b/parquet/src/encryption/decryption.rs index eac78def6dc4..8f340ecf4b43 100644 --- a/parquet/src/encryption/decryption.rs +++ b/parquet/src/encryption/decryption.rs @@ -116,14 +116,6 @@ impl FileDecryptionProperties { pub fn builder(footer_key: Vec) -> DecryptionPropertiesBuilder { DecryptionPropertiesBuilder::new(footer_key) } - - pub fn has_column_keys(&self) -> bool { - self.column_keys.is_some() - } - - pub fn aad_prefix(&self) -> Option> { - self.aad_prefix.clone() - } } pub struct 
DecryptionPropertiesBuilder { @@ -154,9 +146,9 @@ impl DecryptionPropertiesBuilder { self } - pub fn with_column_key(mut self, key: Vec, value: Vec) -> Self { + pub fn with_column_key(mut self, column_name: Vec, decryption_key: Vec) -> Self { let mut column_keys = self.column_keys.unwrap_or_default(); - column_keys.insert(key, value); + column_keys.insert(column_name, decryption_key); self.column_keys = Some(column_keys); self } From 1e73b25b3ea482979b338b53ee04f101a13b0ce8 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Mon, 10 Feb 2025 12:04:39 +0100 Subject: [PATCH 63/97] move file_decryption_properties into ArrowReaderOptions --- parquet/Cargo.toml | 1 - parquet/src/arrow/arrow_reader/mod.rs | 80 +++++----- parquet/src/arrow/async_reader/mod.rs | 206 +++++++++----------------- 3 files changed, 107 insertions(+), 180 deletions(-) diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index 671675ae0c02..21ab28722ea6 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -31,7 +31,6 @@ rust-version = { workspace = true } [target.'cfg(target_arch = "wasm32")'.dependencies] ahash = { version = "0.8", default-features = false, features = ["compile-time-rng"] } # See https://github.com/briansmith/ring/issues/918#issuecomment-2077788925 -[target.'cfg(feature = "encryption")'.dependencies] ring = { version = "0.17", default-features = false, features = ["wasm32_unknown_unknown_js", "std"] } [target.'cfg(not(target_arch = "wasm32"))'.dependencies] diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index c931de118965..f31352beb4c8 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -253,6 +253,9 @@ pub struct ArrowReaderOptions { supplied_schema: Option, /// If true, attempt to read `OffsetIndex` and `ColumnIndex` pub(crate) page_index: bool, + /// If encryption is enabled, the file decryption properties can be provided + #[cfg(feature = "encryption")] + pub(crate) 
file_decryption_properties: Option, } impl ArrowReaderOptions { @@ -343,6 +346,20 @@ impl ArrowReaderOptions { pub fn with_page_index(self, page_index: bool) -> Self { Self { page_index, ..self } } + + /// Provide the file decryption properties to use when reading encrypted parquet files. + /// + /// If encryption is enabled and the file is encrypted, the `file_decryption_properties` must be provided. + #[cfg(feature = "encryption")] + pub fn with_file_decryption_properties( + self, + file_decryption_properties: FileDecryptionProperties, + ) -> Self { + Self { + file_decryption_properties: Some(file_decryption_properties), + ..self + } + } } /// The metadata necessary to construct a [`ArrowReaderBuilder`] @@ -383,17 +400,11 @@ impl ArrowReaderMetadata { /// /// If encryption is enabled and the file is encrypted, the /// `file_decryption_properties` must be provided. - pub fn load( - reader: &T, - options: ArrowReaderOptions, - #[cfg(feature = "encryption")] file_decryption_properties: Option< - &FileDecryptionProperties, - >, - ) -> Result { + pub fn load(reader: &T, options: ArrowReaderOptions) -> Result { let metadata = ParquetMetaDataReader::new().with_page_indexes(options.page_index); #[cfg(feature = "encryption")] let metadata = metadata - .with_decryption_properties(file_decryption_properties) + .with_decryption_properties(options.file_decryption_properties.as_ref()) .parse_and_finish(reader)?; #[cfg(not(feature = "encryption"))] let metadata = metadata.parse_and_finish(reader)?; @@ -542,23 +553,14 @@ impl ParquetRecordBatchReaderBuilder { /// Create a new [`ParquetRecordBatchReaderBuilder`] with [`ArrowReaderOptions`] pub fn try_new_with_options(reader: T, options: ArrowReaderOptions) -> Result { - let metadata = ArrowReaderMetadata::load( - &reader, - options, - #[cfg(feature = "encryption")] - None, - )?; + let metadata = ArrowReaderMetadata::load(&reader, options)?; Ok(Self::new_with_metadata(reader, metadata)) } /// Create a new 
[`ParquetRecordBatchReaderBuilder`] with [`ArrowReaderOptions`] and [`FileDecryptionProperties`] #[cfg(feature = "encryption")] - pub fn try_new_with_decryption( - reader: T, - options: ArrowReaderOptions, - file_decryption_properties: Option<&FileDecryptionProperties>, - ) -> Result { - let metadata = ArrowReaderMetadata::load(&reader, options, file_decryption_properties)?; + pub fn try_new_with_decryption(reader: T, options: ArrowReaderOptions) -> Result { + let metadata = ArrowReaderMetadata::load(&reader, options)?; Ok(Self::new_with_metadata(reader, metadata)) } @@ -592,7 +594,7 @@ impl ParquetRecordBatchReaderBuilder { /// # writer.close().unwrap(); /// # let file = Bytes::from(file); /// # - /// let metadata = ArrowReaderMetadata::load(&file, Default::default(), None).unwrap(); + /// let metadata = ArrowReaderMetadata::load(&file, Default::default()).unwrap(); /// let mut a = ParquetRecordBatchReaderBuilder::new_with_metadata(file.clone(), metadata.clone()).build().unwrap(); /// let mut b = ParquetRecordBatchReaderBuilder::new_with_metadata(file, metadata).build().unwrap(); /// @@ -864,13 +866,11 @@ impl ParquetRecordBatchReader { batch_size: usize, file_decryption_properties: Option<&FileDecryptionProperties>, ) -> Result { - ParquetRecordBatchReaderBuilder::try_new_with_decryption( - reader, - Default::default(), - file_decryption_properties, - )? - .with_batch_size(batch_size) - .build() + let options = ArrowReaderOptions::default() + .with_file_decryption_properties(file_decryption_properties.cloned().unwrap()); + ParquetRecordBatchReaderBuilder::try_new_with_decryption(reader, options)? 
+ .with_batch_size(batch_size) + .build() } /// Create a new [`ParquetRecordBatchReader`] from the provided [`RowGroups`] @@ -1904,13 +1904,7 @@ mod tests { let path = format!("{testdata}/encrypt_columns_plaintext_footer.parquet.encrypted"); let file = File::open(&path).unwrap(); - let metadata = ArrowReaderMetadata::load( - &file, - Default::default(), - #[cfg(feature = "encryption")] - None, - ) - .unwrap(); + let metadata = ArrowReaderMetadata::load(&file, Default::default()).unwrap(); let file_metadata = metadata.metadata.file_metadata(); assert_eq!(file_metadata.num_rows(), 50); @@ -2013,9 +2007,9 @@ mod tests { .build() .unwrap(); - let decryption_properties = Some(decryption_properties); - let metadata = - ArrowReaderMetadata::load(&file, Default::default(), decryption_properties.as_ref()); + let options = + ArrowReaderOptions::default().with_file_decryption_properties(decryption_properties); + let metadata = ArrowReaderMetadata::load(&file, options); match metadata { Err(crate::errors::ParquetError::NYI(s)) => { @@ -2032,17 +2026,15 @@ mod tests { file: File, decryption_properties: FileDecryptionProperties, ) { - let decryption_properties = Some(decryption_properties); - - let metadata = - ArrowReaderMetadata::load(&file, Default::default(), decryption_properties.as_ref()) - .unwrap(); + let options = ArrowReaderOptions::default() + .with_file_decryption_properties(decryption_properties.clone()); + let metadata = ArrowReaderMetadata::load(&file, options).unwrap(); let file_metadata = metadata.metadata.file_metadata(); let record_reader = ParquetRecordBatchReader::try_new_with_decryption( file, 128, - decryption_properties.as_ref(), + Some(&decryption_properties), ) .unwrap(); diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index 429372d49239..c6080dd52f74 100644 --- a/parquet/src/arrow/async_reader/mod.rs +++ b/parquet/src/arrow/async_reader/mod.rs @@ -68,6 +68,7 @@ use 
crate::encryption::decryption::FileDecryptionProperties; mod store; use crate::arrow::schema::ParquetField; +#[cfg(feature = "encryption")] use crate::encryption::decryption::CryptoContext; #[cfg(feature = "object_store")] pub use store::*; @@ -201,16 +202,13 @@ impl ArrowReaderMetadata { pub async fn load_async( input: &mut T, options: ArrowReaderOptions, - #[cfg(feature = "encryption")] file_decryption_properties: Option< - &FileDecryptionProperties, - >, ) -> Result { // TODO: this is all rather awkward. It would be nice if AsyncFileReader::get_metadata // took an argument to fetch the page indexes. let mut metadata = input .get_metadata( #[cfg(feature = "encryption")] - file_decryption_properties, + options.file_decryption_properties.as_ref(), ) .await?; @@ -222,7 +220,10 @@ impl ArrowReaderMetadata { let mut reader = ParquetMetaDataReader::new_with_metadata(m).with_page_indexes(true); #[cfg(feature = "encryption")] - reader = reader.with_decryption_properties(file_decryption_properties); + { + reader = + reader.with_decryption_properties(options.file_decryption_properties.as_ref()); + } reader.load_page_index(input).await?; metadata = Arc::new(reader.finish()?) @@ -382,31 +383,13 @@ impl ParquetRecordBatchStreamBuilder { /// # } /// ``` pub async fn new(input: T) -> Result { - Self::new_with_options( - input, - Default::default(), - #[cfg(feature = "encryption")] - None, - ) - .await + Self::new_with_options(input, Default::default()).await } /// Create a new [`ParquetRecordBatchStreamBuilder`] with the provided async source, /// [`ArrowReaderOptions`] and [`FileDecryptionProperties`] if the data is encrypted. 
- pub async fn new_with_options( - mut input: T, - options: ArrowReaderOptions, - #[cfg(feature = "encryption")] file_decryption_properties: Option< - &FileDecryptionProperties, - >, - ) -> Result { - let metadata = ArrowReaderMetadata::load_async( - &mut input, - options, - #[cfg(feature = "encryption")] - file_decryption_properties, - ) - .await?; + pub async fn new_with_options(mut input: T, options: ArrowReaderOptions) -> Result { + let metadata = ArrowReaderMetadata::load_async(&mut input, options).await?; Ok(Self::new_with_metadata(input, metadata)) } @@ -439,7 +422,7 @@ impl ParquetRecordBatchStreamBuilder { /// // open file with parquet data /// let mut file = tokio::fs::File::from_std(file); /// // load metadata once - /// let meta = ArrowReaderMetadata::load_async(&mut file, Default::default(), #[cfg(feature = "encryption")] None).await.unwrap(); + /// let meta = ArrowReaderMetadata::load_async(&mut file, Default::default()).await.unwrap(); /// // create two readers, a and b, from the same underlying file /// // without reading the metadata again /// let mut a = ParquetRecordBatchStreamBuilder::new_with_metadata( @@ -1023,39 +1006,35 @@ impl RowGroups for InMemoryRowGroup<'_> { fn column_chunks(&self, i: usize) -> Result> { #[cfg(feature = "encryption")] - let crypto_context = if let Some(file_decryptor) = &self - .parquet_metadata - .clone() - .file_decryptor() - .clone() - { - let column_name = &self - .parquet_metadata - .clone() - .file_metadata() - .schema_descr() - .column(i); - - if file_decryptor.is_column_encrypted(column_name.name().as_bytes()) { - let data_decryptor = - file_decryptor.get_column_data_decryptor(column_name.name().as_bytes()); - let metadata_decryptor = - file_decryptor.get_column_metadata_decryptor(column_name.name().as_bytes()); - - let crypto_context = CryptoContext::new( - self.row_group_ordinal, - i, - data_decryptor, - metadata_decryptor, - file_decryptor.file_aad().clone(), - ); - Some(Arc::new(crypto_context)) + let 
crypto_context = + if let Some(file_decryptor) = &self.parquet_metadata.clone().file_decryptor().clone() { + let column_name = &self + .parquet_metadata + .clone() + .file_metadata() + .schema_descr() + .column(i); + + if file_decryptor.is_column_encrypted(column_name.name().as_bytes()) { + let data_decryptor = + file_decryptor.get_column_data_decryptor(column_name.name().as_bytes()); + let metadata_decryptor = + file_decryptor.get_column_metadata_decryptor(column_name.name().as_bytes()); + + let crypto_context = CryptoContext::new( + self.row_group_ordinal, + i, + data_decryptor, + metadata_decryptor, + file_decryptor.file_aad().clone(), + ); + Some(Arc::new(crypto_context)) + } else { + None + } } else { None - } - } else { - None - }; + }; match &self.column_chunks[i] { None => Err(ParquetError::General(format!( @@ -1347,14 +1326,9 @@ mod tests { }; let options = ArrowReaderOptions::new().with_page_index(true); - let builder = ParquetRecordBatchStreamBuilder::new_with_options( - async_reader, - options, - #[cfg(feature = "encryption")] - None, - ) - .await - .unwrap(); + let builder = ParquetRecordBatchStreamBuilder::new_with_options(async_reader, options) + .await + .unwrap(); // The builder should have page and offset indexes loaded now let metadata_with_index = builder.metadata(); @@ -1466,14 +1440,9 @@ mod tests { }; let options = ArrowReaderOptions::new().with_page_index(true); - let builder = ParquetRecordBatchStreamBuilder::new_with_options( - async_reader, - options, - #[cfg(feature = "encryption")] - None, - ) - .await - .unwrap(); + let builder = ParquetRecordBatchStreamBuilder::new_with_options(async_reader, options) + .await + .unwrap(); let selection = RowSelection::from(vec![ RowSelector::skip(21), // Skip first page @@ -1554,14 +1523,9 @@ mod tests { }; let options = ArrowReaderOptions::new().with_page_index(true); - let builder = ParquetRecordBatchStreamBuilder::new_with_options( - async_reader, - options, - #[cfg(feature = "encryption")] - None, 
- ) - .await - .unwrap(); + let builder = ParquetRecordBatchStreamBuilder::new_with_options(async_reader, options) + .await + .unwrap(); let col_idx: usize = rand.gen_range(0..13); let mask = ProjectionMask::leaves(builder.parquet_schema(), vec![col_idx]); @@ -1630,14 +1594,9 @@ mod tests { }; let options = ArrowReaderOptions::new().with_page_index(true); - let builder = ParquetRecordBatchStreamBuilder::new_with_options( - async_reader, - options, - #[cfg(feature = "encryption")] - None, - ) - .await - .unwrap(); + let builder = ParquetRecordBatchStreamBuilder::new_with_options(async_reader, options) + .await + .unwrap(); let col_idx: usize = rand.gen_range(0..13); let mask = ProjectionMask::leaves(builder.parquet_schema(), vec![col_idx]); @@ -1869,19 +1828,14 @@ mod tests { let mask = ProjectionMask::leaves(&parquet_schema, vec![0, 2]); let options = ArrowReaderOptions::new().with_page_index(true); - let stream = ParquetRecordBatchStreamBuilder::new_with_options( - async_reader, - options, - #[cfg(feature = "encryption")] - None, - ) - .await - .unwrap() - .with_projection(mask.clone()) - .with_batch_size(1024) - .with_row_filter(filter) - .build() - .unwrap(); + let stream = ParquetRecordBatchStreamBuilder::new_with_options(async_reader, options) + .await + .unwrap() + .with_projection(mask.clone()) + .with_batch_size(1024) + .with_row_filter(filter) + .build() + .unwrap(); let batches: Vec = stream.try_collect().await.unwrap(); @@ -2266,8 +2220,6 @@ mod tests { let mut reader = ParquetRecordBatchStreamBuilder::new_with_options( tokio::fs::File::from_std(file.try_clone().unwrap()), ArrowReaderOptions::new().with_page_index(true), - #[cfg(feature = "encryption")] - None, ) .await .unwrap(); @@ -2481,22 +2433,15 @@ mod tests { file: &mut File, decryption_properties: FileDecryptionProperties, ) { - let decryption_properties = Some(decryption_properties); + let options = + ArrowReaderOptions::new().with_file_decryption_properties(decryption_properties); - let 
metadata = ArrowReaderMetadata::load_async( - file, - Default::default(), - decryption_properties.as_ref(), - ) - .await - .unwrap(); - let arrow_reader_metadata = ArrowReaderMetadata::load_async( - file, - Default::default(), - decryption_properties.as_ref(), - ) - .await - .unwrap(); + let metadata = ArrowReaderMetadata::load_async(file, options.clone()) + .await + .unwrap(); + let arrow_reader_metadata = ArrowReaderMetadata::load_async(file, options) + .await + .unwrap(); let file_metadata = metadata.metadata.file_metadata(); let record_reader = ParquetRecordBatchStreamBuilder::new_with_metadata( @@ -2600,14 +2545,9 @@ mod tests { let path = format!("{testdata}/encrypt_columns_plaintext_footer.parquet.encrypted"); let mut file = File::open(&path).await.unwrap(); - let metadata = ArrowReaderMetadata::load_async( - &mut file, - Default::default(), - #[cfg(feature = "encryption")] - None, - ) - .await - .unwrap(); + let metadata = ArrowReaderMetadata::load_async(&mut file, Default::default()) + .await + .unwrap(); let file_metadata = metadata.metadata.file_metadata(); assert_eq!(file_metadata.num_rows(), 50); @@ -2711,13 +2651,9 @@ mod tests { .build() .unwrap(); - let decryption_properties = Some(decryption_properties); - let metadata = ArrowReaderMetadata::load_async( - &mut file, - Default::default(), - decryption_properties.as_ref(), - ) - .await; + let options = + ArrowReaderOptions::new().with_file_decryption_properties(decryption_properties); + let metadata = ArrowReaderMetadata::load_async(&mut file, options).await; match metadata { Err(ParquetError::NYI(s)) => { From e3e3163ca828cb01fb82fae8482b1001e570a6f1 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Mon, 10 Feb 2025 13:05:18 +0100 Subject: [PATCH 64/97] make create_page_aad method of CryptoContext --- parquet/examples/read_with_rowgroup.rs | 7 ++++++- parquet/src/encryption/decryption.rs | 14 ++++++++++++++ parquet/src/encryption/modules.rs | 18 +----------------- parquet/src/file/metadata/mod.rs | 4 
++-- parquet/src/file/serialized_reader.rs | 18 +++--------------- 5 files changed, 26 insertions(+), 35 deletions(-) diff --git a/parquet/examples/read_with_rowgroup.rs b/parquet/examples/read_with_rowgroup.rs index b458907b6187..44d25596110e 100644 --- a/parquet/examples/read_with_rowgroup.rs +++ b/parquet/examples/read_with_rowgroup.rs @@ -35,7 +35,12 @@ async fn main() -> Result<()> { let mut file = File::open(&path).await.unwrap(); // The metadata could be cached in other places, this example only shows how to read - let metadata = file.get_metadata(None).await?; + let metadata = file + .get_metadata( + #[cfg(feature = "encryption")] + None, + ) + .await?; for rg in metadata.row_groups() { let mut rowgroup = InMemoryRowGroup::create(rg.clone(), ProjectionMask::all()); diff --git a/parquet/src/encryption/decryption.rs b/parquet/src/encryption/decryption.rs index 8f340ecf4b43..ee9368f1f6cc 100644 --- a/parquet/src/encryption/decryption.rs +++ b/parquet/src/encryption/decryption.rs @@ -16,6 +16,7 @@ // under the License. 
use crate::encryption::ciphers::{BlockDecryptor, RingGcmBlockDecryptor}; +use crate::encryption::modules::{create_module_aad, ModuleType}; use crate::errors::Result; use std::collections::HashMap; use std::io::Read; @@ -80,6 +81,19 @@ impl CryptoContext { } } + pub(crate) fn create_page_aad( + &self, + module_type: ModuleType, + ) -> crate::errors::Result> { + create_module_aad( + self.file_aad(), + module_type, + self.row_group_ordinal, + self.column_ordinal, + self.page_ordinal, + ) + } + pub fn for_dictionary_page(&self) -> Self { Self { row_group_ordinal: self.row_group_ordinal, diff --git a/parquet/src/encryption/modules.rs b/parquet/src/encryption/modules.rs index 5ff060b3abd4..0f2f8083f5ed 100644 --- a/parquet/src/encryption/modules.rs +++ b/parquet/src/encryption/modules.rs @@ -31,23 +31,7 @@ pub fn create_footer_aad(file_aad: &[u8]) -> crate::errors::Result> { create_module_aad(file_aad, ModuleType::Footer, 0, 0, None) } -pub(crate) fn create_page_aad( - file_aad: &[u8], - module_type: ModuleType, - row_group_ordinal: usize, - column_ordinal: usize, - page_ordinal: Option, -) -> crate::errors::Result> { - create_module_aad( - file_aad, - module_type, - row_group_ordinal, - column_ordinal, - page_ordinal, - ) -} - -fn create_module_aad( +pub(crate) fn create_module_aad( file_aad: &[u8], module_type: ModuleType, row_group_ordinal: usize, diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index 630730b476a7..e6ec488fe9f4 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -99,7 +99,7 @@ use crate::basic::{ColumnOrder, Compression, Encoding, Type}; #[cfg(feature = "encryption")] use crate::encryption::{ decryption::FileDecryptor, - modules::{create_page_aad, ModuleType}, + modules::{create_module_aad, ModuleType}, }; use crate::errors::{ParquetError, Result}; pub(crate) use crate::file::metadata::memory::HeapSize; @@ -673,7 +673,7 @@ impl RowGroupMetaData { } }; - let column_aad = 
create_page_aad( + let column_aad = create_module_aad( decryptor.file_aad(), ModuleType::ColumnMetaData, rg.ordinal.unwrap() as usize, diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index 1407e0da61f6..36dca84e2d96 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -25,7 +25,7 @@ use crate::compression::{create_codec, Codec}; #[cfg(feature = "encryption")] use crate::encryption::{ decryption::{read_and_decrypt, CryptoContext}, - modules::{create_page_aad, ModuleType}, + modules::ModuleType, }; use crate::errors::{ParquetError, Result}; use crate::file::page_index::offset_index::OffsetIndexMetaData; @@ -357,13 +357,7 @@ pub(crate) fn read_page_header( } else { ModuleType::DataPageHeader }; - let aad = create_page_aad( - crypto_context.file_aad(), - module_type, - crypto_context.row_group_ordinal, - crypto_context.column_ordinal, - crypto_context.page_ordinal, - )?; + let aad = crypto_context.create_page_aad(module_type)?; let buf = read_and_decrypt(data_decryptor, input, aad.as_ref())?; @@ -450,13 +444,7 @@ pub(crate) fn decode_page( } else { ModuleType::DataPage }; - let aad = create_page_aad( - crypto_context.file_aad(), - module_type, - crypto_context.row_group_ordinal, - crypto_context.column_ordinal, - crypto_context.page_ordinal, - )?; + let aad = crypto_context.create_page_aad(module_type)?; let decrypted = decryptor.decrypt(buffer.as_ref(), &aad)?; Bytes::from(decrypted) } else { From 105c3e908f2772d0d9e54d9bf8d26a2579e11f5a Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Mon, 10 Feb 2025 22:22:15 +0100 Subject: [PATCH 65/97] Review feedback --- parquet/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index 21ab28722ea6..809b5cf6c210 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -31,7 +31,7 @@ rust-version = { workspace = true } [target.'cfg(target_arch = "wasm32")'.dependencies] ahash = 
{ version = "0.8", default-features = false, features = ["compile-time-rng"] } # See https://github.com/briansmith/ring/issues/918#issuecomment-2077788925 -ring = { version = "0.17", default-features = false, features = ["wasm32_unknown_unknown_js", "std"] } +ring = { version = "0.17", default-features = false, features = ["wasm32_unknown_unknown_js", "std"], optional = true } [target.'cfg(not(target_arch = "wasm32"))'.dependencies] ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } From a8a204e674c57a375be65481c45c5120c63db857 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Mon, 10 Feb 2025 23:08:34 +0100 Subject: [PATCH 66/97] Infer ModuleType in create_page_aad --- parquet/src/arrow/arrow_reader/mod.rs | 3 --- parquet/src/encryption/decryption.rs | 18 ++++++++++++++++-- parquet/src/file/serialized_reader.rs | 20 +++----------------- 3 files changed, 19 insertions(+), 22 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index f31352beb4c8..dff7253f42ba 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -397,9 +397,6 @@ impl ArrowReaderMetadata { /// If `options` has [`ArrowReaderOptions::with_page_index`] true, but /// `Self::metadata` is missing the page index, this function will attempt /// to load the page index by making an object store request. - /// - /// If encryption is enabled and the file is encrypted, the - /// `file_decryption_properties` must be provided. 
pub fn load(reader: &T, options: ArrowReaderOptions) -> Result { let metadata = ParquetMetaDataReader::new().with_page_indexes(options.page_index); #[cfg(feature = "encryption")] diff --git a/parquet/src/encryption/decryption.rs b/parquet/src/encryption/decryption.rs index ee9368f1f6cc..d5e90f063cc1 100644 --- a/parquet/src/encryption/decryption.rs +++ b/parquet/src/encryption/decryption.rs @@ -83,8 +83,22 @@ impl CryptoContext { pub(crate) fn create_page_aad( &self, - module_type: ModuleType, - ) -> crate::errors::Result> { + is_header: bool, + ) -> Result> { + let module_type = if self.dictionary_page { + if is_header { + ModuleType::DictionaryPageHeader + } else { + ModuleType::DictionaryPage + } + } else { + if is_header { + ModuleType::DataPageHeader + } else { + ModuleType::DataPage + } + }; + create_module_aad( self.file_aad(), module_type, diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index 36dca84e2d96..01e7b280e1cd 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -23,10 +23,7 @@ use crate::bloom_filter::Sbbf; use crate::column::page::{Page, PageMetadata, PageReader}; use crate::compression::{create_codec, Codec}; #[cfg(feature = "encryption")] -use crate::encryption::{ - decryption::{read_and_decrypt, CryptoContext}, - modules::ModuleType, -}; +use crate::encryption::decryption::{read_and_decrypt, CryptoContext}; use crate::errors::{ParquetError, Result}; use crate::file::page_index::offset_index::OffsetIndexMetaData; use crate::file::{ @@ -351,13 +348,7 @@ pub(crate) fn read_page_header( #[cfg(feature = "encryption")] if let Some(crypto_context) = crypto_context { let data_decryptor = crypto_context.data_decryptor(); - - let module_type = if crypto_context.dictionary_page { - ModuleType::DictionaryPageHeader - } else { - ModuleType::DataPageHeader - }; - let aad = crypto_context.create_page_aad(module_type)?; + let aad = crypto_context.create_page_aad(true)?; 
let buf = read_and_decrypt(data_decryptor, input, aad.as_ref())?; @@ -439,12 +430,7 @@ pub(crate) fn decode_page( #[cfg(feature = "encryption")] let buffer: Bytes = if let Some(crypto_context) = crypto_context { let decryptor = crypto_context.data_decryptor(); - let module_type = if crypto_context.dictionary_page { - ModuleType::DictionaryPage - } else { - ModuleType::DataPage - }; - let aad = crypto_context.create_page_aad(module_type)?; + let aad = crypto_context.create_page_aad(false)?; let decrypted = decryptor.decrypt(buffer.as_ref(), &aad)?; Bytes::from(decrypted) } else { From fb3b6b04910088dd9afd5784c15cebb84e9d4236 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Mon, 10 Feb 2025 23:51:54 +0100 Subject: [PATCH 67/97] add create_page_header_aad --- parquet/src/encryption/decryption.rs | 33 +++++++++++++++------------ parquet/src/file/serialized_reader.rs | 4 ++-- 2 files changed, 21 insertions(+), 16 deletions(-) diff --git a/parquet/src/encryption/decryption.rs b/parquet/src/encryption/decryption.rs index d5e90f063cc1..5a8d1e994218 100644 --- a/parquet/src/encryption/decryption.rs +++ b/parquet/src/encryption/decryption.rs @@ -81,22 +81,27 @@ impl CryptoContext { } } - pub(crate) fn create_page_aad( - &self, - is_header: bool, - ) -> Result> { + pub(crate) fn create_page_header_aad(&self) -> Result> { + let module_type = if self.dictionary_page { + ModuleType::DictionaryPageHeader + } else { + ModuleType::DataPageHeader + }; + + create_module_aad( + self.file_aad(), + module_type, + self.row_group_ordinal, + self.column_ordinal, + self.page_ordinal, + ) + } + + pub(crate) fn create_page_aad(&self) -> Result> { let module_type = if self.dictionary_page { - if is_header { - ModuleType::DictionaryPageHeader - } else { - ModuleType::DictionaryPage - } + ModuleType::DictionaryPage } else { - if is_header { - ModuleType::DataPageHeader - } else { - ModuleType::DataPage - } + ModuleType::DataPage }; create_module_aad( diff --git 
a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index 01e7b280e1cd..3ddeb82a0cb3 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -348,7 +348,7 @@ pub(crate) fn read_page_header( #[cfg(feature = "encryption")] if let Some(crypto_context) = crypto_context { let data_decryptor = crypto_context.data_decryptor(); - let aad = crypto_context.create_page_aad(true)?; + let aad = crypto_context.create_page_header_aad()?; let buf = read_and_decrypt(data_decryptor, input, aad.as_ref())?; @@ -430,7 +430,7 @@ pub(crate) fn decode_page( #[cfg(feature = "encryption")] let buffer: Bytes = if let Some(crypto_context) = crypto_context { let decryptor = crypto_context.data_decryptor(); - let aad = crypto_context.create_page_aad(false)?; + let aad = crypto_context.create_page_aad()?; let decrypted = decryptor.decrypt(buffer.as_ref(), &aad)?; Bytes::from(decrypted) } else { From 11c4e7a5c9fb7662b78cfe1e844f2d9db1963bbd Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Fri, 14 Feb 2025 16:42:41 +0100 Subject: [PATCH 68/97] Review feedback --- parquet/src/arrow/arrow_reader/mod.rs | 33 +++---------------------- parquet/src/arrow/async_reader/mod.rs | 5 ++-- parquet/src/arrow/async_reader/store.rs | 15 +++-------- 3 files changed, 10 insertions(+), 43 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index dff7253f42ba..2b0f66d2f22d 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -554,13 +554,6 @@ impl ParquetRecordBatchReaderBuilder { Ok(Self::new_with_metadata(reader, metadata)) } - /// Create a new [`ParquetRecordBatchReaderBuilder`] with [`ArrowReaderOptions`] and [`FileDecryptionProperties`] - #[cfg(feature = "encryption")] - pub fn try_new_with_decryption(reader: T, options: ArrowReaderOptions) -> Result { - let metadata = ArrowReaderMetadata::load(&reader, options)?; - 
Ok(Self::new_with_metadata(reader, metadata)) - } - /// Create a [`ParquetRecordBatchReaderBuilder`] from the provided [`ArrowReaderMetadata`] /// /// This interface allows: @@ -854,22 +847,6 @@ impl ParquetRecordBatchReader { .build() } - /// Create a new [`ParquetRecordBatchReader`] from the provided chunk reader and [`FileDecryptionProperties`] - /// - /// Note: this is needed when the parquet file is encrypted - #[cfg(feature = "encryption")] - pub fn try_new_with_decryption( - reader: T, - batch_size: usize, - file_decryption_properties: Option<&FileDecryptionProperties>, - ) -> Result { - let options = ArrowReaderOptions::default() - .with_file_decryption_properties(file_decryption_properties.cloned().unwrap()); - ParquetRecordBatchReaderBuilder::try_new_with_decryption(reader, options)? - .with_batch_size(batch_size) - .build() - } - /// Create a new [`ParquetRecordBatchReader`] from the provided [`RowGroups`] /// /// Note: this is a low-level interface see [`ParquetRecordBatchReader::try_new`] for a @@ -2025,15 +2002,11 @@ mod tests { ) { let options = ArrowReaderOptions::default() .with_file_decryption_properties(decryption_properties.clone()); - let metadata = ArrowReaderMetadata::load(&file, options).unwrap(); + let metadata = ArrowReaderMetadata::load(&file, options.clone()).unwrap(); let file_metadata = metadata.metadata.file_metadata(); - let record_reader = ParquetRecordBatchReader::try_new_with_decryption( - file, - 128, - Some(&decryption_properties), - ) - .unwrap(); + let builder = ParquetRecordBatchReaderBuilder::try_new_with_options(file, options).unwrap(); + let record_reader = builder.build().unwrap(); assert_eq!(file_metadata.num_rows(), 50); assert_eq!(file_metadata.schema_descr().num_columns(), 8); diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index c6080dd52f74..0180b422bb6a 100644 --- a/parquet/src/arrow/async_reader/mod.rs +++ b/parquet/src/arrow/async_reader/mod.rs @@ -386,8 +386,9 @@ impl 
ParquetRecordBatchStreamBuilder { Self::new_with_options(input, Default::default()).await } - /// Create a new [`ParquetRecordBatchStreamBuilder`] with the provided async source, - /// [`ArrowReaderOptions`] and [`FileDecryptionProperties`] if the data is encrypted. + /// Create a new [`ParquetRecordBatchStreamBuilder`] with the provided async source + /// and [`ArrowReaderOptions`]. If the data is encrypted, [`ArrowReaderOptions`] should + /// have the [`FileDecryptionProperties`] set. pub async fn new_with_options(mut input: T, options: ArrowReaderOptions) -> Result { let metadata = ArrowReaderMetadata::load_async(&mut input, options).await?; Ok(Self::new_with_metadata(input, metadata)) diff --git a/parquet/src/arrow/async_reader/store.rs b/parquet/src/arrow/async_reader/store.rs index bf1765b09ef8..251c3ea7c341 100644 --- a/parquet/src/arrow/async_reader/store.rs +++ b/parquet/src/arrow/async_reader/store.rs @@ -172,21 +172,14 @@ impl AsyncFileReader for ParquetObjectReader { ) -> BoxFuture<'a, Result>> { Box::pin(async move { let file_size = self.meta.size; - #[cfg(not(feature = "encryption"))] let metadata = ParquetMetaDataReader::new() .with_column_indexes(self.preload_column_index) .with_offset_indexes(self.preload_offset_index) - .with_prefetch_hint(self.metadata_size_hint) - .load_and_finish(self, file_size) - .await?; + .with_prefetch_hint(self.metadata_size_hint); #[cfg(feature = "encryption")] - let metadata = ParquetMetaDataReader::new() - .with_column_indexes(self.preload_column_index) - .with_offset_indexes(self.preload_offset_index) - .with_prefetch_hint(self.metadata_size_hint) - .with_decryption_properties(file_decryption_properties) - .load_and_finish(self, file_size) - .await?; + let metadata = metadata.with_decryption_properties(file_decryption_properties); + + let metadata = metadata.load_and_finish(self, file_size).await?; Ok(Arc::new(metadata)) }) } From 815e35d5d74b3ff18c09189eb4eb7d42a3a7f6d4 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: 
Fri, 14 Feb 2025 17:10:54 +0100 Subject: [PATCH 69/97] Update parquet/src/arrow/async_reader/store.rs Co-authored-by: Ed Seidl --- parquet/src/arrow/async_reader/mod.rs | 7 ++----- parquet/src/arrow/async_reader/store.rs | 1 + parquet/src/encryption/decryption.rs | 2 ++ parquet/src/file/metadata/reader.rs | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index 0180b422bb6a..321ca5f52a5e 100644 --- a/parquet/src/arrow/async_reader/mod.rs +++ b/parquet/src/arrow/async_reader/mod.rs @@ -108,7 +108,6 @@ pub trait AsyncFileReader: Send { /// Provides asynchronous access to the [`ParquetMetaData`] of a parquet file, /// allowing fine-grained control over how metadata is sourced, in particular allowing /// for caching, pre-fetching, catalog metadata, etc... - /// If data is encrypted, the [`FileDecryptionProperties`] should be provided. fn get_metadata<'a>( &'a mut self, #[cfg(feature = "encryption")] file_decryption_properties: Option< @@ -243,8 +242,7 @@ pub struct AsyncReader(T); /// /// This builder handles reading the parquet file metadata, allowing consumers /// to use this information to select what specific columns, row groups, etc... -/// they wish to be read by the resulting stream. If footer or columns are encrypted -/// [`FileDecryptionProperties`] should be provided. +/// they wish to be read by the resulting stream. /// /// See examples on [`ParquetRecordBatchStreamBuilder::new`] /// @@ -387,8 +385,7 @@ impl ParquetRecordBatchStreamBuilder { } /// Create a new [`ParquetRecordBatchStreamBuilder`] with the provided async source - /// and [`ArrowReaderOptions`]. If the data is encrypted, [`ArrowReaderOptions`] should - /// have the [`FileDecryptionProperties`] set. + /// and [`ArrowReaderOptions`]. 
pub async fn new_with_options(mut input: T, options: ArrowReaderOptions) -> Result { let metadata = ArrowReaderMetadata::load_async(&mut input, options).await?; Ok(Self::new_with_metadata(input, metadata)) diff --git a/parquet/src/arrow/async_reader/store.rs b/parquet/src/arrow/async_reader/store.rs index 251c3ea7c341..0354bbc5616c 100644 --- a/parquet/src/arrow/async_reader/store.rs +++ b/parquet/src/arrow/async_reader/store.rs @@ -23,6 +23,7 @@ use object_store::{path::Path, ObjectMeta, ObjectStore}; use tokio::runtime::Handle; use crate::arrow::async_reader::AsyncFileReader; +#[cfg(feature = "encryption")] use crate::encryption::decryption::FileDecryptionProperties; use crate::errors::{ParquetError, Result}; use crate::file::metadata::{ParquetMetaData, ParquetMetaDataReader}; diff --git a/parquet/src/encryption/decryption.rs b/parquet/src/encryption/decryption.rs index 5a8d1e994218..a019ace580e3 100644 --- a/parquet/src/encryption/decryption.rs +++ b/parquet/src/encryption/decryption.rs @@ -138,6 +138,7 @@ impl CryptoContext { } } +/// FileDecryptionProperties hold keys and AAD data required to decrypt a Parquet file. #[derive(Debug, Clone, PartialEq)] pub struct FileDecryptionProperties { footer_key: Vec, @@ -146,6 +147,7 @@ pub struct FileDecryptionProperties { } impl FileDecryptionProperties { + /// Returns a new FileDecryptionProperties builder pub fn builder(footer_key: Vec) -> DecryptionPropertiesBuilder { DecryptionPropertiesBuilder::new(footer_key) } diff --git a/parquet/src/file/metadata/reader.rs b/parquet/src/file/metadata/reader.rs index 3641caa1695d..87a964b89d30 100644 --- a/parquet/src/file/metadata/reader.rs +++ b/parquet/src/file/metadata/reader.rs @@ -156,7 +156,7 @@ impl ParquetMetaDataReader { self } - /// Provide the [`FileDecryptionProperties`] to use when decrypting the file. + /// Provide the FileDecryptionProperties to use when decrypting the file. /// /// This is only necessary when the file is encrypted. 
#[cfg(feature = "encryption")] From d3df0abd68cb03c4e75854831596d52f46986b8c Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Sun, 16 Feb 2025 15:02:36 +0100 Subject: [PATCH 70/97] Review feedback --- parquet/src/arrow/async_reader/metadata.rs | 4 ++-- parquet/src/arrow/async_reader/mod.rs | 2 +- parquet/src/file/metadata/reader.rs | 8 ++++---- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/parquet/src/arrow/async_reader/metadata.rs b/parquet/src/arrow/async_reader/metadata.rs index 53f992e424c7..4f41cd6ceebd 100644 --- a/parquet/src/arrow/async_reader/metadata.rs +++ b/parquet/src/arrow/async_reader/metadata.rs @@ -131,7 +131,7 @@ impl MetadataLoader { ( ParquetMetaDataReader::decode_metadata( &meta, - footer.encrypted_footer(), + footer.is_encrypted_footer(), #[cfg(feature = "encryption")] None, )?, @@ -144,7 +144,7 @@ impl MetadataLoader { ( ParquetMetaDataReader::decode_metadata( slice, - footer.encrypted_footer(), + footer.is_encrypted_footer(), #[cfg(feature = "encryption")] None, )?, diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index 321ca5f52a5e..be41065f71f7 100644 --- a/parquet/src/arrow/async_reader/mod.rs +++ b/parquet/src/arrow/async_reader/mod.rs @@ -179,7 +179,7 @@ impl AsyncFileReader for T { Ok(Arc::new(ParquetMetaDataReader::decode_metadata( &buf, - footer.encrypted_footer(), + footer.is_encrypted_footer(), #[cfg(feature = "encryption")] file_decryption_properties, )?)) diff --git a/parquet/src/file/metadata/reader.rs b/parquet/src/file/metadata/reader.rs index 87a964b89d30..62e0b61bec4c 100644 --- a/parquet/src/file/metadata/reader.rs +++ b/parquet/src/file/metadata/reader.rs @@ -95,7 +95,7 @@ impl FooterTail { } /// Whether the footer metadata is encrypted - pub fn encrypted_footer(&self) -> bool { + pub fn is_encrypted_footer(&self) -> bool { self.encrypted_footer } } @@ -570,7 +570,7 @@ impl ParquetMetaDataReader { let start = file_size - footer_metadata_len as u64; 
Self::decode_metadata( chunk_reader.get_bytes(start, metadata_len)?.as_ref(), - footer.encrypted_footer(), + footer.is_encrypted_footer(), #[cfg(feature = "encryption")] self.file_decryption_properties.as_ref(), ) @@ -639,7 +639,7 @@ impl ParquetMetaDataReader { Ok(( Self::decode_metadata( &meta, - footer.encrypted_footer(), + footer.is_encrypted_footer(), #[cfg(feature = "encryption")] file_decryption_properties, )?, @@ -651,7 +651,7 @@ impl ParquetMetaDataReader { Ok(( Self::decode_metadata( slice, - footer.encrypted_footer(), + footer.is_encrypted_footer(), #[cfg(feature = "encryption")] file_decryption_properties, )?, From fd1411f8cfc445538f07dbde6ca98c4e06a48661 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 6 Mar 2025 13:03:34 +0100 Subject: [PATCH 71/97] Update parquet/src/encryption/ciphers.rs Co-authored-by: Adam Reeve --- parquet/src/encryption/ciphers.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet/src/encryption/ciphers.rs b/parquet/src/encryption/ciphers.rs index 8385932671f5..f864c28f4ce4 100644 --- a/parquet/src/encryption/ciphers.rs +++ b/parquet/src/encryption/ciphers.rs @@ -46,7 +46,7 @@ impl RingGcmBlockDecryptor { impl BlockDecryptor for RingGcmBlockDecryptor { fn decrypt(&self, length_and_ciphertext: &[u8], aad: &[u8]) -> Result> { let mut result = - Vec::with_capacity(length_and_ciphertext.len() - SIZE_LEN - NONCE_LEN - TAG_LEN); + Vec::with_capacity(length_and_ciphertext.len() - SIZE_LEN - NONCE_LEN); result.extend_from_slice(&length_and_ciphertext[SIZE_LEN + NONCE_LEN..]); let nonce = ring::aead::Nonce::try_assume_unique_for_key( From 5950f3a35cabda19d14632a142d36d7dac0eb364 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 6 Mar 2025 15:04:58 +0100 Subject: [PATCH 72/97] Review feedback --- parquet/README.md | 3 + parquet/src/arrow/arrow_reader/mod.rs | 19 +++--- parquet/src/arrow/async_reader/mod.rs | 88 +++++++++++++-------------- parquet/src/encryption/ciphers.rs | 3 +- 
parquet/src/encryption/decryption.rs | 37 ++++++----- parquet/src/file/metadata/mod.rs | 2 +- 6 files changed, 76 insertions(+), 76 deletions(-) diff --git a/parquet/README.md b/parquet/README.md index 1224e52f3f5a..9245664b4ef0 100644 --- a/parquet/README.md +++ b/parquet/README.md @@ -63,6 +63,7 @@ The `parquet` crate provides the following features which may be enabled in your - `crc` - enables functionality to automatically verify checksums of each page (if present) when decoding - `experimental` - Experimental APIs which may change, even between minor releases - `simdutf8` (default) - Use the [`simdutf8`] crate for SIMD-accelerated UTF-8 validation +- `encryption` - support for reading / writing encrypted Parquet files [`arrow`]: https://crates.io/crates/arrow [`simdutf8`]: https://crates.io/crates/simdutf8 @@ -76,12 +77,14 @@ The `parquet` crate provides the following features which may be enabled in your - [x] Row record reader - [x] Arrow record reader - [x] Async support (to Arrow) + - [x] Encrypted files - [x] Statistics support - [x] Write support - [x] Primitive column value writers - [ ] Row record writer - [x] Arrow record writer - [x] Async support + - [ ] Encrypted files - [x] Predicate pushdown - [x] Parquet format 4.0.0 support diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 2b0f66d2f22d..b5376d9328d9 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -708,11 +708,10 @@ impl Iterator for ReaderPageIterator { .schema_descr() .column(self.column_idx); - if file_decryptor.is_column_encrypted(column_name.name().as_bytes()) { - let data_decryptor = - file_decryptor.get_column_data_decryptor(column_name.name().as_bytes()); + if file_decryptor.is_column_encrypted(column_name.name()) { + let data_decryptor = file_decryptor.get_column_data_decryptor(column_name.name()); let metadata_decryptor = - 
file_decryptor.get_column_metadata_decryptor(column_name.name().as_bytes()); + file_decryptor.get_column_metadata_decryptor(column_name.name()); let crypto_context = CryptoContext::new( rg_idx, @@ -1864,8 +1863,8 @@ mod tests { let column_2_key = "1234567890123451".as_bytes(); let decryption_properties = FileDecryptionProperties::builder(footer_key.to_vec()) - .with_column_key("double_field".as_bytes().to_vec(), column_1_key.to_vec()) - .with_column_key("float_field".as_bytes().to_vec(), column_2_key.to_vec()) + .with_column_key("double_field", column_1_key.to_vec()) + .with_column_key("float_field", column_2_key.to_vec()) .build() .unwrap(); @@ -1941,8 +1940,8 @@ mod tests { let column_2_key = "1234567890123451".as_bytes(); let decryption_properties = FileDecryptionProperties::builder(footer_key.to_vec()) - .with_column_key("double_field".as_bytes().to_vec(), column_1_key.to_vec()) - .with_column_key("float_field".as_bytes().to_vec(), column_2_key.to_vec()) + .with_column_key("double_field", column_1_key.to_vec()) + .with_column_key("float_field", column_2_key.to_vec()) .build() .unwrap(); @@ -1976,8 +1975,8 @@ mod tests { let column_2_key = "1234567890123451".as_bytes(); let decryption_properties = FileDecryptionProperties::builder(footer_key.to_vec()) - .with_column_key("double_field".as_bytes().to_vec(), column_1_key.to_vec()) - .with_column_key("float_field".as_bytes().to_vec(), column_2_key.to_vec()) + .with_column_key("double_field", column_1_key.to_vec()) + .with_column_key("float_field", column_2_key.to_vec()) .build() .unwrap(); diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index be41065f71f7..81ddb3e5ec31 100644 --- a/parquet/src/arrow/async_reader/mod.rs +++ b/parquet/src/arrow/async_reader/mod.rs @@ -1004,35 +1004,35 @@ impl RowGroups for InMemoryRowGroup<'_> { fn column_chunks(&self, i: usize) -> Result> { #[cfg(feature = "encryption")] - let crypto_context = - if let Some(file_decryptor) = 
&self.parquet_metadata.clone().file_decryptor().clone() { - let column_name = &self - .parquet_metadata - .clone() - .file_metadata() - .schema_descr() - .column(i); - - if file_decryptor.is_column_encrypted(column_name.name().as_bytes()) { - let data_decryptor = - file_decryptor.get_column_data_decryptor(column_name.name().as_bytes()); - let metadata_decryptor = - file_decryptor.get_column_metadata_decryptor(column_name.name().as_bytes()); - - let crypto_context = CryptoContext::new( - self.row_group_ordinal, - i, - data_decryptor, - metadata_decryptor, - file_decryptor.file_aad().clone(), - ); - Some(Arc::new(crypto_context)) - } else { - None - } + let crypto_context = if let Some(file_decryptor) = + &self.parquet_metadata.clone().file_decryptor().clone() + { + let column_name = &self + .parquet_metadata + .clone() + .file_metadata() + .schema_descr() + .column(i); + + if file_decryptor.is_column_encrypted(column_name.name()) { + let data_decryptor = file_decryptor.get_column_data_decryptor(column_name.name()); + let metadata_decryptor = + file_decryptor.get_column_metadata_decryptor(column_name.name()); + + let crypto_context = CryptoContext::new( + self.row_group_ordinal, + i, + data_decryptor, + metadata_decryptor, + file_decryptor.file_aad().clone(), + ); + Some(Arc::new(crypto_context)) } else { None - }; + } + } else { + None + }; match &self.column_chunks[i] { None => Err(ParquetError::General(format!( @@ -2524,13 +2524,13 @@ mod tests { // There is always a footer key even with a plaintext footer, // but this is used for signing the footer. 
- let footer_key = "0123456789012345".as_bytes(); // 128bit/16 - let column_1_key = "1234567890123450".as_bytes(); - let column_2_key = "1234567890123451".as_bytes(); + let footer_key = "0123456789012345".as_bytes().to_vec(); // 128bit/16 + let column_1_key = "1234567890123450".as_bytes().to_vec(); + let column_2_key = "1234567890123451".as_bytes().to_vec(); - let decryption_properties = FileDecryptionProperties::builder(footer_key.to_vec()) - .with_column_key("double_field".as_bytes().to_vec(), column_1_key.to_vec()) - .with_column_key("float_field".as_bytes().to_vec(), column_2_key.to_vec()) + let decryption_properties = FileDecryptionProperties::builder(footer_key) + .with_column_key("double_field", column_1_key) + .with_column_key("float_field", column_2_key) .build() .unwrap(); @@ -2604,13 +2604,13 @@ mod tests { let path = format!("{testdata}/encrypt_columns_and_footer.parquet.encrypted"); let mut file = File::open(&path).await.unwrap(); - let footer_key = "0123456789012345".as_bytes(); // 128bit/16 - let column_1_key = "1234567890123450".as_bytes(); - let column_2_key = "1234567890123451".as_bytes(); + let footer_key = "0123456789012345".as_bytes().to_vec(); // 128bit/16 + let column_1_key = "1234567890123450".as_bytes().to_vec(); + let column_2_key = "1234567890123451".as_bytes().to_vec(); let decryption_properties = FileDecryptionProperties::builder(footer_key.to_vec()) - .with_column_key("double_field".as_bytes().to_vec(), column_1_key.to_vec()) - .with_column_key("float_field".as_bytes().to_vec(), column_2_key.to_vec()) + .with_column_key("double_field", column_1_key) + .with_column_key("float_field", column_2_key) .build() .unwrap(); @@ -2639,13 +2639,13 @@ mod tests { let path = format!("{testdata}/encrypt_columns_and_footer_ctr.parquet.encrypted"); let mut file = File::open(&path).await.unwrap(); - let footer_key = "0123456789012345".as_bytes(); - let column_1_key = "1234567890123450".as_bytes(); - let column_2_key = "1234567890123451".as_bytes(); + 
let footer_key = "0123456789012345".as_bytes().to_vec(); + let column_1_key = "1234567890123450".as_bytes().to_vec(); + let column_2_key = "1234567890123451".as_bytes().to_vec(); - let decryption_properties = FileDecryptionProperties::builder(footer_key.to_vec()) - .with_column_key("double_field".as_bytes().to_vec(), column_1_key.to_vec()) - .with_column_key("float_field".as_bytes().to_vec(), column_2_key.to_vec()) + let decryption_properties = FileDecryptionProperties::builder(footer_key) + .with_column_key("double_field", column_1_key) + .with_column_key("float_field", column_2_key) .build() .unwrap(); diff --git a/parquet/src/encryption/ciphers.rs b/parquet/src/encryption/ciphers.rs index f864c28f4ce4..aa2a75893dc7 100644 --- a/parquet/src/encryption/ciphers.rs +++ b/parquet/src/encryption/ciphers.rs @@ -45,8 +45,7 @@ impl RingGcmBlockDecryptor { impl BlockDecryptor for RingGcmBlockDecryptor { fn decrypt(&self, length_and_ciphertext: &[u8], aad: &[u8]) -> Result> { - let mut result = - Vec::with_capacity(length_and_ciphertext.len() - SIZE_LEN - NONCE_LEN); + let mut result = Vec::with_capacity(length_and_ciphertext.len() - SIZE_LEN - NONCE_LEN); result.extend_from_slice(&length_and_ciphertext[SIZE_LEN + NONCE_LEN..]); let nonce = ring::aead::Nonce::try_assume_unique_for_key( diff --git a/parquet/src/encryption/decryption.rs b/parquet/src/encryption/decryption.rs index a019ace580e3..ed6d5921a5d2 100644 --- a/parquet/src/encryption/decryption.rs +++ b/parquet/src/encryption/decryption.rs @@ -142,7 +142,7 @@ impl CryptoContext { #[derive(Debug, Clone, PartialEq)] pub struct FileDecryptionProperties { footer_key: Vec, - column_keys: Option, Vec>>, + column_keys: HashMap>, aad_prefix: Option>, } @@ -155,7 +155,7 @@ impl FileDecryptionProperties { pub struct DecryptionPropertiesBuilder { footer_key: Vec, - column_keys: Option, Vec>>, + column_keys: HashMap>, aad_prefix: Option>, } @@ -163,7 +163,7 @@ impl DecryptionPropertiesBuilder { pub fn new(footer_key: Vec) -> 
DecryptionPropertiesBuilder { Self { footer_key, - column_keys: None, + column_keys: HashMap::default(), aad_prefix: None, } } @@ -181,10 +181,9 @@ impl DecryptionPropertiesBuilder { self } - pub fn with_column_key(mut self, column_name: Vec, decryption_key: Vec) -> Self { - let mut column_keys = self.column_keys.unwrap_or_default(); - column_keys.insert(column_name, decryption_key); - self.column_keys = Some(column_keys); + pub fn with_column_key(mut self, column_name: &str, decryption_key: Vec) -> Self { + self.column_keys + .insert(column_name.to_string(), decryption_key); self } } @@ -209,10 +208,10 @@ impl FileDecryptor { aad_prefix: Vec, ) -> Self { let file_aad = [aad_prefix.as_slice(), aad_file_unique.as_slice()].concat(); + // todo decr: if no key available yet (not set in properties, should be retrieved from metadata) let footer_decryptor = RingGcmBlockDecryptor::new(&decryption_properties.footer_key); Self { - // todo decr: if no key available yet (not set in properties, will be retrieved from metadata) footer_decryptor: Some(Arc::new(footer_decryptor)), decryption_properties: decryption_properties.clone(), file_aad, @@ -223,19 +222,16 @@ impl FileDecryptor { self.footer_decryptor.clone().unwrap() } - pub(crate) fn get_column_data_decryptor(&self, column_name: &[u8]) -> Arc { - match self.decryption_properties.column_keys.as_ref() { + pub(crate) fn get_column_data_decryptor(&self, column_name: &str) -> Arc { + match self.decryption_properties.column_keys.get(column_name) { + Some(column_key) => Arc::new(RingGcmBlockDecryptor::new(column_key)), None => self.get_footer_decryptor(), - Some(column_keys) => match column_keys.get(column_name) { - None => self.get_footer_decryptor(), - Some(column_key) => Arc::new(RingGcmBlockDecryptor::new(column_key)), - }, } } pub(crate) fn get_column_metadata_decryptor( &self, - column_name: &[u8], + column_name: &str, ) -> Arc { // Once GCM CTR mode is implemented, data and metadata decryptors may be different 
self.get_column_data_decryptor(column_name) @@ -245,11 +241,14 @@ impl FileDecryptor { &self.file_aad } - pub(crate) fn is_column_encrypted(&self, column_name: &[u8]) -> bool { + pub(crate) fn is_column_encrypted(&self, column_name: &str) -> bool { // Column is encrypted if either uniform encryption is used or an encryption key is set for the column - match self.decryption_properties.column_keys.as_ref() { - None => true, - Some(keys) => keys.contains_key(column_name), + match self.decryption_properties.column_keys.is_empty() { + false => self + .decryption_properties + .column_keys + .contains_key(column_name), + true => true, } } } diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index e6ec488fe9f4..c754a5693b92 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -666,7 +666,7 @@ impl RowGroupMetaData { } Some(ColumnCryptoMetaData::ENCRYPTIONWITHCOLUMNKEY(crypto_metadata)) => { let column_name = crypto_metadata.path_in_schema.join("."); - decryptor.get_column_metadata_decryptor(column_name.as_bytes()) + decryptor.get_column_metadata_decryptor(column_name.as_str()) } Some(ColumnCryptoMetaData::ENCRYPTIONWITHFOOTERKEY(_)) => { decryptor.get_footer_decryptor() From 1e37bec3fcbed2eb8a63eb7bc25b2421206ca7c8 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 6 Mar 2025 19:44:38 +0100 Subject: [PATCH 73/97] WIP: Decryption shouldn't change the API --- parquet/examples/read_with_rowgroup.rs | 9 +- parquet/src/arrow/arrow_reader/mod.rs | 22 ++-- parquet/src/arrow/async_reader/metadata.rs | 17 +-- parquet/src/arrow/async_reader/mod.rs | 114 +++++++++++++-------- parquet/src/arrow/async_reader/store.rs | 28 +++-- parquet/src/column/writer/mod.rs | 8 -- parquet/src/file/footer.rs | 7 +- parquet/src/file/metadata/mod.rs | 84 ++++++++------- parquet/src/file/metadata/reader.rs | 96 +++++++++++------ parquet/src/file/serialized_reader.rs | 25 ++--- parquet/src/file/writer.rs | 2 - 
parquet/tests/arrow_writer_layout.rs | 2 - 12 files changed, 224 insertions(+), 190 deletions(-) diff --git a/parquet/examples/read_with_rowgroup.rs b/parquet/examples/read_with_rowgroup.rs index 44d25596110e..8cccc7fe14ac 100644 --- a/parquet/examples/read_with_rowgroup.rs +++ b/parquet/examples/read_with_rowgroup.rs @@ -35,12 +35,7 @@ async fn main() -> Result<()> { let mut file = File::open(&path).await.unwrap(); // The metadata could be cached in other places, this example only shows how to read - let metadata = file - .get_metadata( - #[cfg(feature = "encryption")] - None, - ) - .await?; + let metadata = file.get_metadata().await?; for rg in metadata.row_groups() { let mut rowgroup = InMemoryRowGroup::create(rg.clone(), ProjectionMask::all()); @@ -126,8 +121,6 @@ impl RowGroups for InMemoryRowGroup { self.metadata.column(i), self.num_rows(), None, - #[cfg(feature = "encryption")] - None, )?); Ok(Box::new(ColumnChunkIterator { diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index b5376d9328d9..0d20e5b28a0b 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -400,10 +400,8 @@ impl ArrowReaderMetadata { pub fn load(reader: &T, options: ArrowReaderOptions) -> Result { let metadata = ParquetMetaDataReader::new().with_page_indexes(options.page_index); #[cfg(feature = "encryption")] - let metadata = metadata - .with_decryption_properties(options.file_decryption_properties.as_ref()) - .parse_and_finish(reader)?; - #[cfg(not(feature = "encryption"))] + let metadata = + metadata.with_decryption_properties(options.file_decryption_properties.as_ref()); let metadata = metadata.parse_and_finish(reader)?; Self::try_new(Arc::new(metadata), options) } @@ -728,14 +726,14 @@ impl Iterator for ReaderPageIterator { None }; - let ret = SerializedPageReader::new( - reader, - meta, - total_rows, - page_locations, - #[cfg(feature = "encryption")] - crypto_context, - ); + let ret = 
SerializedPageReader::new(reader, meta, total_rows, page_locations); + + #[cfg(feature = "encryption")] + if crypto_context.is_some() { + let ret = Ok(ret.unwrap().with_crypto_context(crypto_context.unwrap())); + return Some(ret.map(|x| Box::new(x) as _)); + } + Some(ret.map(|x| Box::new(x) as _)) } } diff --git a/parquet/src/arrow/async_reader/metadata.rs b/parquet/src/arrow/async_reader/metadata.rs index 4f41cd6ceebd..71d2e57ddd50 100644 --- a/parquet/src/arrow/async_reader/metadata.rs +++ b/parquet/src/arrow/async_reader/metadata.rs @@ -128,26 +128,13 @@ impl MetadataLoader { let (metadata, remainder) = if length > suffix_len - FOOTER_SIZE { let metadata_start = file_size - length - FOOTER_SIZE; let meta = fetch.fetch(metadata_start..file_size - FOOTER_SIZE).await?; - ( - ParquetMetaDataReader::decode_metadata( - &meta, - footer.is_encrypted_footer(), - #[cfg(feature = "encryption")] - None, - )?, - None, - ) + (ParquetMetaDataReader::decode_metadata(&meta)?, None) } else { let metadata_start = file_size - length - FOOTER_SIZE - footer_start; let slice = &suffix[metadata_start..suffix_len - FOOTER_SIZE]; ( - ParquetMetaDataReader::decode_metadata( - slice, - footer.is_encrypted_footer(), - #[cfg(feature = "encryption")] - None, - )?, + ParquetMetaDataReader::decode_metadata(slice)?, Some((footer_start, suffix.slice(..metadata_start))), ) }; diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index 81ddb3e5ec31..9dfb4cba7407 100644 --- a/parquet/src/arrow/async_reader/mod.rs +++ b/parquet/src/arrow/async_reader/mod.rs @@ -108,12 +108,16 @@ pub trait AsyncFileReader: Send { /// Provides asynchronous access to the [`ParquetMetaData`] of a parquet file, /// allowing fine-grained control over how metadata is sourced, in particular allowing /// for caching, pre-fetching, catalog metadata, etc... 
- fn get_metadata<'a>( - &'a mut self, - #[cfg(feature = "encryption")] file_decryption_properties: Option< - &'a FileDecryptionProperties, - >, - ) -> BoxFuture<'a, Result>>; + fn get_metadata(&mut self) -> BoxFuture<'_, Result>>; + + #[cfg(feature = "encryption")] + fn with_file_decryption_properties( + &mut self, + file_decryption_properties: FileDecryptionProperties, + ); + + #[cfg(feature = "encryption")] + fn read_encrypted(&self) -> bool; } /// This allows Box to be used as an AsyncFileReader, @@ -126,16 +130,22 @@ impl AsyncFileReader for Box { self.as_mut().get_byte_ranges(ranges) } - fn get_metadata<'a>( - &'a mut self, - #[cfg(feature = "encryption")] file_decryption_properties: Option< - &'a FileDecryptionProperties, - >, - ) -> BoxFuture<'a, Result>> { - self.as_mut().get_metadata( - #[cfg(feature = "encryption")] - file_decryption_properties, - ) + fn get_metadata(&mut self) -> BoxFuture<'_, Result>> { + self.as_mut().get_metadata() + } + + #[cfg(feature = "encryption")] + fn with_file_decryption_properties( + &mut self, + file_decryption_properties: FileDecryptionProperties, + ) { + self.as_mut() + .with_file_decryption_properties(file_decryption_properties); + } + + #[cfg(feature = "encryption")] + fn read_encrypted(&self) -> bool { + self.as_ref().read_encrypted() } } @@ -156,12 +166,20 @@ impl AsyncFileReader for T { .boxed() } - fn get_metadata<'a>( - &'a mut self, - #[cfg(feature = "encryption")] file_decryption_properties: Option< - &'a FileDecryptionProperties, - >, - ) -> BoxFuture<'a, Result>> { + #[cfg(feature = "encryption")] + fn with_file_decryption_properties( + &mut self, + file_decryption_properties: FileDecryptionProperties, + ) { + self.with_file_decryption_properties(file_decryption_properties); + } + + #[cfg(feature = "encryption")] + fn read_encrypted(&self) -> bool { + self.read_encrypted() + } + + fn get_metadata(&mut self) -> BoxFuture<'_, Result>> { const FOOTER_SIZE_I64: i64 = FOOTER_SIZE as i64; async move { 
self.seek(SeekFrom::End(-FOOTER_SIZE_I64)).await?; @@ -177,12 +195,14 @@ impl AsyncFileReader for T { let mut buf = Vec::with_capacity(metadata_len); self.take(metadata_len as _).read_to_end(&mut buf).await?; - Ok(Arc::new(ParquetMetaDataReader::decode_metadata( - &buf, - footer.is_encrypted_footer(), - #[cfg(feature = "encryption")] - file_decryption_properties, - )?)) + // todo: decrypt + if self.read_encrypted() { + todo!(); + } + let parquet_metadata_reader = ParquetMetaDataReader::decode_metadata(&buf)?; + // #[cfg(feature = "encryption")] + // parquet_metadata_reader.with_file_decryptor(file_decryption_properties) + Ok(Arc::new(parquet_metadata_reader)) } .boxed() } @@ -204,12 +224,7 @@ impl ArrowReaderMetadata { ) -> Result { // TODO: this is all rather awkward. It would be nice if AsyncFileReader::get_metadata // took an argument to fetch the page indexes. - let mut metadata = input - .get_metadata( - #[cfg(feature = "encryption")] - options.file_decryption_properties.as_ref(), - ) - .await?; + let mut metadata = input.get_metadata().await?; if options.page_index && metadata.column_index().is_none() @@ -1044,14 +1059,17 @@ impl RowGroups for InMemoryRowGroup<'_> { // filter out empty offset indexes (old versions specified Some(vec![]) when no present) .filter(|index| !index.is_empty()) .map(|index| index[i].page_locations.clone()); - let page_reader: Box = Box::new(SerializedPageReader::new( + let page_reader = SerializedPageReader::new( data.clone(), self.metadata.column(i), self.row_count, page_locations, - #[cfg(feature = "encryption")] - crypto_context, - )?); + )?; + + #[cfg(feature = "encryption")] + let page_reader = page_reader.with_crypto_context(crypto_context.unwrap()); + + let page_reader: Box = Box::new(page_reader); Ok(Box::new(ColumnChunkIterator { reader: Some(Ok(page_reader)), @@ -1172,14 +1190,22 @@ mod tests { futures::future::ready(Ok(self.data.slice(range))).boxed() } - fn get_metadata<'a>( - &'a mut self, - #[cfg(feature = 
"encryption")] _file_decryption_properties: Option< - &'a FileDecryptionProperties, - >, - ) -> BoxFuture<'a, Result>> { + fn get_metadata(&mut self) -> BoxFuture<'_, Result>> { futures::future::ready(Ok(self.metadata.clone())).boxed() } + + #[cfg(feature = "encryption")] + fn with_file_decryption_properties( + &mut self, + file_decryption_properties: FileDecryptionProperties, + ) { + todo!("we don't test for decryption yet"); + } + + #[cfg(feature = "encryption")] + fn read_encrypted(&self) -> bool { + todo!("we don't test for decryption yet"); + } } #[tokio::test] diff --git a/parquet/src/arrow/async_reader/store.rs b/parquet/src/arrow/async_reader/store.rs index 0354bbc5616c..9bcb9d29492d 100644 --- a/parquet/src/arrow/async_reader/store.rs +++ b/parquet/src/arrow/async_reader/store.rs @@ -60,6 +60,8 @@ pub struct ParquetObjectReader { preload_column_index: bool, preload_offset_index: bool, runtime: Option, + #[cfg(feature = "encryption")] + file_decryption_properties: Option, } impl ParquetObjectReader { @@ -74,6 +76,8 @@ impl ParquetObjectReader { preload_column_index: false, preload_offset_index: false, runtime: None, + #[cfg(feature = "encryption")] + file_decryption_properties: None, } } @@ -165,12 +169,7 @@ impl AsyncFileReader for ParquetObjectReader { // an `impl MetadataFetch` and calls those methods to get data from it. Due to `Self`'s impl of // `AsyncFileReader`, the calls to `MetadataFetch::fetch` are just delegated to // `Self::get_bytes`. 
- fn get_metadata<'a>( - &'a mut self, - #[cfg(feature = "encryption")] file_decryption_properties: Option< - &'a FileDecryptionProperties, - >, - ) -> BoxFuture<'a, Result>> { + fn get_metadata(&mut self) -> BoxFuture<'_, Result>> { Box::pin(async move { let file_size = self.meta.size; let metadata = ParquetMetaDataReader::new() @@ -178,12 +177,27 @@ impl AsyncFileReader for ParquetObjectReader { .with_offset_indexes(self.preload_offset_index) .with_prefetch_hint(self.metadata_size_hint); #[cfg(feature = "encryption")] - let metadata = metadata.with_decryption_properties(file_decryption_properties); + let file_decryption_properties = self.file_decryption_properties.clone().unwrap(); + #[cfg(feature = "encryption")] + let metadata = metadata.with_decryption_properties(Some(&file_decryption_properties)); let metadata = metadata.load_and_finish(self, file_size).await?; Ok(Arc::new(metadata)) }) } + + #[cfg(feature = "encryption")] + fn with_file_decryption_properties( + &mut self, + file_decryption_properties: FileDecryptionProperties, + ) { + self.file_decryption_properties = Some(file_decryption_properties); + } + + #[cfg(feature = "encryption")] + fn read_encrypted(&self) -> bool { + self.file_decryption_properties.is_some() + } } #[cfg(test)] diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index 50156a26e276..5f34f34cbb7a 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -2105,8 +2105,6 @@ mod tests { r.rows_written as usize, None, Arc::new(props), - #[cfg(feature = "encryption")] - None, ) .unwrap(); @@ -2159,8 +2157,6 @@ mod tests { r.rows_written as usize, None, Arc::new(props), - #[cfg(feature = "encryption")] - None, ) .unwrap(); @@ -2296,8 +2292,6 @@ mod tests { r.rows_written as usize, None, Arc::new(props), - #[cfg(feature = "encryption")] - None, ) .unwrap(), ); @@ -3747,8 +3741,6 @@ mod tests { result.rows_written as usize, None, Arc::new(props), - #[cfg(feature = "encryption")] - 
None, ) .unwrap(), ); diff --git a/parquet/src/file/footer.rs b/parquet/src/file/footer.rs index 5be084259e18..85ef30cd0ecc 100644 --- a/parquet/src/file/footer.rs +++ b/parquet/src/file/footer.rs @@ -58,12 +58,7 @@ pub fn parse_metadata(chunk_reader: &R) -> Result Result { - ParquetMetaDataReader::decode_metadata( - buf, - false, - #[cfg(feature = "encryption")] - None, - ) + ParquetMetaDataReader::decode_metadata(buf) } /// Decodes the Parquet footer returning the metadata length in bytes diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index c754a5693b92..804f33110603 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -189,21 +189,23 @@ pub struct ParquetMetaData { impl ParquetMetaData { /// Creates Parquet metadata from file metadata and a list of row /// group metadata - pub fn new( - file_metadata: FileMetaData, - row_groups: Vec, - #[cfg(feature = "encryption")] file_decryptor: Option, - ) -> Self { + pub fn new(file_metadata: FileMetaData, row_groups: Vec) -> Self { ParquetMetaData { file_metadata, row_groups, #[cfg(feature = "encryption")] - file_decryptor, + file_decryptor: None, column_index: None, offset_index: None, } } + #[allow(missing_docs)] + #[cfg(feature = "encryption")] + pub fn with_file_decryptor(&mut self, file_decryptor: Option) { + self.file_decryptor = file_decryptor; + } + /// Creates Parquet metadata from file metadata, a list of row /// group metadata, and the column index structures. 
#[deprecated(since = "53.1.0", note = "Use ParquetMetaDataBuilder")] @@ -347,12 +349,7 @@ pub struct ParquetMetaDataBuilder(ParquetMetaData); impl ParquetMetaDataBuilder { /// Create a new builder from a file metadata, with no row groups pub fn new(file_meta_data: FileMetaData) -> Self { - Self(ParquetMetaData::new( - file_meta_data, - vec![], - #[cfg(feature = "encryption")] - None, - )) + Self(ParquetMetaData::new(file_meta_data, vec![])) } /// Create a new builder from an existing ParquetMetaData @@ -626,11 +623,10 @@ impl RowGroupMetaData { self.file_offset } - /// Method to convert from Thrift. - pub fn from_thrift( + pub fn from_encrypted_thrift( schema_descr: SchemaDescPtr, mut rg: RowGroup, - #[cfg(feature = "encryption")] decryptor: Option<&FileDecryptor>, + decryptor: Option<&FileDecryptor>, ) -> Result { if schema_descr.num_columns() != rg.columns.len() { return Err(general_err!( @@ -643,11 +639,6 @@ impl RowGroupMetaData { let num_rows = rg.num_rows; let mut columns = vec![]; - #[cfg(not(feature = "encryption"))] - for (c, d) in rg.columns.drain(0..).zip(schema_descr.columns()) { - columns.push(ColumnChunkMetaData::from_thrift(d.clone(), c)?); - } - #[cfg(feature = "encryption")] for (i, (mut c, d)) in rg .columns @@ -690,6 +681,36 @@ impl RowGroupMetaData { } columns.push(ColumnChunkMetaData::from_thrift(d.clone(), c)?); } + + let sorting_columns = rg.sorting_columns; + Ok(RowGroupMetaData { + columns, + num_rows, + sorting_columns, + total_byte_size, + schema_descr, + file_offset: rg.file_offset, + ordinal: rg.ordinal, + }) + } + + /// Method to convert from Thrift. + pub fn from_thrift(schema_descr: SchemaDescPtr, mut rg: RowGroup) -> Result { + if schema_descr.num_columns() != rg.columns.len() { + return Err(general_err!( + "Column count mismatch. 
Schema has {} columns while Row Group has {}", + schema_descr.num_columns(), + rg.columns.len() + )); + } + let total_byte_size = rg.total_byte_size; + let num_rows = rg.num_rows; + let mut columns = vec![]; + + for (c, d) in rg.columns.drain(0..).zip(schema_descr.columns()) { + columns.push(ColumnChunkMetaData::from_thrift(d.clone(), c)?); + } + let sorting_columns = rg.sorting_columns; Ok(RowGroupMetaData { columns, @@ -1680,14 +1701,9 @@ mod tests { .unwrap(); let row_group_exp = row_group_meta.to_thrift(); - let row_group_res = RowGroupMetaData::from_thrift( - schema_descr, - row_group_exp.clone(), - #[cfg(feature = "encryption")] - None, - ) - .unwrap() - .to_thrift(); + let row_group_res = RowGroupMetaData::from_thrift(schema_descr, row_group_exp.clone()) + .unwrap() + .to_thrift(); assert_eq!(row_group_res, row_group_exp); } @@ -1766,14 +1782,10 @@ mod tests { .build() .unwrap(); - let err = RowGroupMetaData::from_thrift( - schema_descr_3cols, - row_group_meta_2cols.to_thrift(), - #[cfg(feature = "encryption")] - None, - ) - .unwrap_err() - .to_string(); + let err = + RowGroupMetaData::from_thrift(schema_descr_3cols, row_group_meta_2cols.to_thrift()) + .unwrap_err() + .to_string(); assert_eq!( err, "Parquet error: Column count mismatch. Schema has 3 columns while Row Group has 2" diff --git a/parquet/src/file/metadata/reader.rs b/parquet/src/file/metadata/reader.rs index 62e0b61bec4c..42524f97fcb0 100644 --- a/parquet/src/file/metadata/reader.rs +++ b/parquet/src/file/metadata/reader.rs @@ -168,6 +168,11 @@ impl ParquetMetaDataReader { self } + #[cfg(feature = "encryption")] + fn read_encrypted(&self) -> bool { + self.file_decryption_properties.is_some() + } + /// Indicates whether this reader has a [`ParquetMetaData`] internally. 
pub fn has_metadata(&self) -> bool { self.metadata.is_some() @@ -568,12 +573,15 @@ impl ParquetMetaDataReader { } let start = file_size - footer_metadata_len as u64; - Self::decode_metadata( - chunk_reader.get_bytes(start, metadata_len)?.as_ref(), - footer.is_encrypted_footer(), - #[cfg(feature = "encryption")] - self.file_decryption_properties.as_ref(), - ) + if self.read_encrypted() { + Self::decrypt_metadata( + chunk_reader.get_bytes(start, metadata_len)?.as_ref(), + true, + self.file_decryption_properties.as_ref(), + ) + } else { + Self::decode_metadata(chunk_reader.get_bytes(start, metadata_len)?.as_ref()) + } } /// Return the number of bytes to read in the initial pass. If `prefetch_size` has @@ -639,9 +647,9 @@ impl ParquetMetaDataReader { Ok(( Self::decode_metadata( &meta, - footer.is_encrypted_footer(), - #[cfg(feature = "encryption")] - file_decryption_properties, + // footer.is_encrypted_footer(), + // #[cfg(feature = "encryption")] + // file_decryption_properties, )?, None, )) @@ -651,9 +659,9 @@ impl ParquetMetaDataReader { Ok(( Self::decode_metadata( slice, - footer.is_encrypted_footer(), - #[cfg(feature = "encryption")] - file_decryption_properties, + // footer.is_encrypted_footer(), + // #[cfg(feature = "encryption")] + // file_decryption_properties, )?, Some((footer_start, suffix.slice(..metadata_start))), )) @@ -695,14 +703,7 @@ impl ParquetMetaDataReader { Self::decode_footer_tail(slice).map(|f| f.metadata_length) } - /// Decodes [`ParquetMetaData`] from the provided bytes. - /// - /// Typically this is used to decode the metadata from the end of a parquet - /// file. The format of `buf` is the Thrift compact binary protocol, as specified - /// by the [Parquet Spec]. 
- /// - /// [Parquet Spec]: https://github.com/apache/parquet-format#metadata - pub fn decode_metadata( + pub fn decrypt_metadata( buf: &[u8], encrypted_footer: bool, #[cfg(feature = "encryption")] file_decryption_properties: Option< @@ -723,7 +724,6 @@ impl ParquetMetaDataReader { #[cfg(feature = "encryption")] let decrypted_fmd_buf; - #[cfg(feature = "encryption")] if encrypted_footer { if let Some(file_decryption_properties) = file_decryption_properties { let t_file_crypto_metadata: TFileCryptoMetaData = @@ -751,7 +751,6 @@ impl ParquetMetaDataReader { let schema = types::from_thrift(&t_file_metadata.schema)?; let schema_descr = Arc::new(SchemaDescriptor::new(schema)); - #[cfg(feature = "encryption")] if let (Some(algo), Some(file_decryption_properties)) = ( t_file_metadata.encryption_algorithm, file_decryption_properties, @@ -762,12 +761,7 @@ impl ParquetMetaDataReader { let mut row_groups = Vec::new(); for rg in t_file_metadata.row_groups { - let r = RowGroupMetaData::from_thrift( - schema_descr.clone(), - rg, - #[cfg(feature = "encryption")] - file_decryptor.as_ref(), - )?; + let r = RowGroupMetaData::from_thrift(schema_descr.clone(), rg)?; row_groups.push(r); } let column_orders = @@ -781,12 +775,46 @@ impl ParquetMetaDataReader { schema_descr, column_orders, ); - Ok(ParquetMetaData::new( - file_metadata, - row_groups, - #[cfg(feature = "encryption")] - file_decryptor, - )) + let mut metadata = ParquetMetaData::new(file_metadata, row_groups); + + metadata.with_file_decryptor(file_decryptor); + + Ok(metadata) + } + + /// Decodes [`ParquetMetaData`] from the provided bytes. + /// + /// Typically this is used to decode the metadata from the end of a parquet + /// file. The format of `buf` is the Thrift compact binary protocol, as specified + /// by the [Parquet Spec]. 
+ /// + /// [Parquet Spec]: https://github.com/apache/parquet-format#metadata + pub fn decode_metadata(buf: &[u8]) -> Result { + let mut prot = TCompactSliceInputProtocol::new(buf); + + let t_file_metadata: TFileMetaData = TFileMetaData::read_from_in_protocol(&mut prot) + .map_err(|e| general_err!("Could not parse metadata: {}", e))?; + let schema = types::from_thrift(&t_file_metadata.schema)?; + let schema_descr = Arc::new(SchemaDescriptor::new(schema)); + + let mut row_groups = Vec::new(); + for rg in t_file_metadata.row_groups { + let r = RowGroupMetaData::from_thrift(schema_descr.clone(), rg)?; + row_groups.push(r); + } + let column_orders = + Self::parse_column_orders(t_file_metadata.column_orders, &schema_descr)?; + + let file_metadata = FileMetaData::new( + t_file_metadata.version, + t_file_metadata.num_rows, + t_file_metadata.created_by, + t_file_metadata.key_value_metadata, + schema_descr, + column_orders, + ); + + Ok(ParquetMetaData::new(file_metadata, row_groups)) } /// Parses column orders from Thrift definition. 
diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index 3ddeb82a0cb3..034a1a3897d7 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -325,8 +325,6 @@ impl RowGroupReader for SerializedRowGroupReader<'_, R self.metadata.num_rows() as usize, page_locations, props, - #[cfg(feature = "encryption")] - None, )?)) } @@ -568,18 +566,16 @@ impl SerializedPageReader { meta: &ColumnChunkMetaData, total_rows: usize, page_locations: Option>, - #[cfg(feature = "encryption")] crypto_context: Option>, ) -> Result { let props = Arc::new(ReaderProperties::builder().build()); - SerializedPageReader::new_with_properties( - reader, - meta, - total_rows, - page_locations, - props, - #[cfg(feature = "encryption")] - crypto_context, - ) + SerializedPageReader::new_with_properties(reader, meta, total_rows, page_locations, props) + } + + #[allow(missing_docs)] + #[cfg(feature = "encryption")] + pub fn with_crypto_context(mut self, crypto_context: Arc) -> Self { + self.crypto_context = Some(crypto_context); + self } /// Creates a new serialized page with custom options. 
@@ -589,7 +585,6 @@ impl SerializedPageReader { total_rows: usize, page_locations: Option>, props: ReaderPropertiesPtr, - #[cfg(feature = "encryption")] crypto_context: Option>, ) -> Result { let decompressor = create_codec(meta.compression(), props.codec_options())?; let (start, len) = meta.byte_range(); @@ -625,7 +620,7 @@ impl SerializedPageReader { state, physical_type: meta.column_type(), #[cfg(feature = "encryption")] - crypto_context, + crypto_context: None, }) } @@ -1317,8 +1312,6 @@ mod tests { row_group.metadata.num_rows() as usize, page_locations, props, - #[cfg(feature = "encryption")] - None, ) } diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index 2fa2d2dcf910..6b7707f03cd9 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -1314,8 +1314,6 @@ mod tests { total_num_values as usize, None, Arc::new(props), - #[cfg(feature = "encryption")] - None, ) .unwrap(); diff --git a/parquet/tests/arrow_writer_layout.rs b/parquet/tests/arrow_writer_layout.rs index 9297b8d13f07..9a66d13f84d7 100644 --- a/parquet/tests/arrow_writer_layout.rs +++ b/parquet/tests/arrow_writer_layout.rs @@ -141,8 +141,6 @@ fn assert_layout(file_reader: &Bytes, meta: &ParquetMetaData, layout: &Layout) { row_group.num_rows() as usize, None, Arc::new(properties), - #[cfg(feature = "encryption")] - None, ) .unwrap(); From 9a1ebe16815493990bdc315b397a256310439a2e Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Fri, 7 Mar 2025 14:09:39 +0100 Subject: [PATCH 74/97] WIP: Decryption shouldn't change the API --- parquet/src/arrow/async_reader/mod.rs | 22 +---- parquet/src/arrow/async_reader/store.rs | 5 -- parquet/src/file/metadata/reader.rs | 12 +-- parquet/src/file/serialized_reader.rs | 107 ++++++++++++++---------- 4 files changed, 69 insertions(+), 77 deletions(-) diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index 9dfb4cba7407..e7cc66747e10 100644 --- a/parquet/src/arrow/async_reader/mod.rs +++ 
b/parquet/src/arrow/async_reader/mod.rs @@ -115,9 +115,6 @@ pub trait AsyncFileReader: Send { &mut self, file_decryption_properties: FileDecryptionProperties, ); - - #[cfg(feature = "encryption")] - fn read_encrypted(&self) -> bool; } /// This allows Box to be used as an AsyncFileReader, @@ -142,11 +139,6 @@ impl AsyncFileReader for Box { self.as_mut() .with_file_decryption_properties(file_decryption_properties); } - - #[cfg(feature = "encryption")] - fn read_encrypted(&self) -> bool { - self.as_ref().read_encrypted() - } } impl AsyncFileReader for T { @@ -174,11 +166,6 @@ impl AsyncFileReader for T { self.with_file_decryption_properties(file_decryption_properties); } - #[cfg(feature = "encryption")] - fn read_encrypted(&self) -> bool { - self.read_encrypted() - } - fn get_metadata(&mut self) -> BoxFuture<'_, Result>> { const FOOTER_SIZE_I64: i64 = FOOTER_SIZE as i64; async move { @@ -196,9 +183,7 @@ impl AsyncFileReader for T { self.take(metadata_len as _).read_to_end(&mut buf).await?; // todo: decrypt - if self.read_encrypted() { - todo!(); - } + let parquet_metadata_reader = ParquetMetaDataReader::decode_metadata(&buf)?; // #[cfg(feature = "encryption")] // parquet_metadata_reader.with_file_decryptor(file_decryption_properties) @@ -1201,11 +1186,6 @@ mod tests { ) { todo!("we don't test for decryption yet"); } - - #[cfg(feature = "encryption")] - fn read_encrypted(&self) -> bool { - todo!("we don't test for decryption yet"); - } } #[tokio::test] diff --git a/parquet/src/arrow/async_reader/store.rs b/parquet/src/arrow/async_reader/store.rs index 9bcb9d29492d..68659effbee2 100644 --- a/parquet/src/arrow/async_reader/store.rs +++ b/parquet/src/arrow/async_reader/store.rs @@ -193,11 +193,6 @@ impl AsyncFileReader for ParquetObjectReader { ) { self.file_decryption_properties = Some(file_decryption_properties); } - - #[cfg(feature = "encryption")] - fn read_encrypted(&self) -> bool { - self.file_decryption_properties.is_some() - } } #[cfg(test)] diff --git 
a/parquet/src/file/metadata/reader.rs b/parquet/src/file/metadata/reader.rs index 42524f97fcb0..71cf48e8e746 100644 --- a/parquet/src/file/metadata/reader.rs +++ b/parquet/src/file/metadata/reader.rs @@ -573,15 +573,15 @@ impl ParquetMetaDataReader { } let start = file_size - footer_metadata_len as u64; - if self.read_encrypted() { - Self::decrypt_metadata( + #[cfg(feature = "encryption")] + if self.file_decryption_properties.is_some() { + return Self::decrypt_metadata( chunk_reader.get_bytes(start, metadata_len)?.as_ref(), true, - self.file_decryption_properties.as_ref(), + self.file_decryption_properties.as_ref() ) - } else { - Self::decode_metadata(chunk_reader.get_bytes(start, metadata_len)?.as_ref()) } + Self::decode_metadata(chunk_reader.get_bytes(start, metadata_len)?.as_ref()) } /// Return the number of bytes to read in the initial pass. If `prefetch_size` has @@ -761,7 +761,7 @@ impl ParquetMetaDataReader { let mut row_groups = Vec::new(); for rg in t_file_metadata.row_groups { - let r = RowGroupMetaData::from_thrift(schema_descr.clone(), rg)?; + let r = RowGroupMetaData::from_encrypted_thrift(schema_descr.clone(), rg, file_decryptor.as_ref())?; row_groups.push(r); } let column_orders = diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index 034a1a3897d7..04cd89dc2b42 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -339,32 +339,32 @@ impl RowGroupReader for SerializedRowGroupReader<'_, R } /// Reads a [`PageHeader`] from the provided [`Read`] -pub(crate) fn read_page_header( +pub(crate) fn read_page_header(input: &mut T) -> Result { + let mut prot = TCompactInputProtocol::new(input); + Ok(PageHeader::read_from_in_protocol(&mut prot)?) 
+} + +#[cfg(feature = "encryption")] +pub(crate) fn read_encrypted_page_header( input: &mut T, - #[cfg(feature = "encryption")] crypto_context: Option>, + crypto_context: Arc, ) -> Result { - #[cfg(feature = "encryption")] - if let Some(crypto_context) = crypto_context { - let data_decryptor = crypto_context.data_decryptor(); - let aad = crypto_context.create_page_header_aad()?; + let data_decryptor = crypto_context.data_decryptor(); + let aad = crypto_context.create_page_header_aad()?; - let buf = read_and_decrypt(data_decryptor, input, aad.as_ref())?; + let buf = read_and_decrypt(data_decryptor, input, aad.as_ref())?; - let mut prot = TCompactSliceInputProtocol::new(buf.as_slice()); - let page_header = PageHeader::read_from_in_protocol(&mut prot)?; - return Ok(page_header); - } - - let mut prot = TCompactInputProtocol::new(input); - let page_header = PageHeader::read_from_in_protocol(&mut prot)?; - Ok(page_header) + let mut prot = TCompactSliceInputProtocol::new(buf.as_slice()); + Ok(PageHeader::read_from_in_protocol(&mut prot)?) } +// todo: decrypt /// Reads a [`PageHeader`] from the provided [`Read`] returning the number of bytes read. /// If the page header is encrypted [`CryptoContext`] must be provided. -fn read_page_header_len( +#[cfg(feature = "encryption")] +fn read_encrypted_page_header_len( input: &mut T, - #[cfg(feature = "encryption")] crypto_context: Option>, + crypto_context: Option>, ) -> Result<(usize, PageHeader)> { /// A wrapper around a [`std::io::Read`] that keeps track of the bytes read struct TrackedRead { @@ -384,14 +384,39 @@ fn read_page_header_len( inner: input, bytes_read: 0, }; - let header = read_page_header( + let header = read_encrypted_page_header( &mut tracked, - #[cfg(feature = "encryption")] - crypto_context, + crypto_context.unwrap(), )?; Ok((tracked.bytes_read, header)) } +/// Reads a [`PageHeader`] from the provided [`Read`] returning the number of bytes read. 
+fn read_page_header_len( + input: &mut T, +) -> Result<(usize, PageHeader)> { + /// A wrapper around a [`std::io::Read`] that keeps track of the bytes read + struct TrackedRead { + inner: R, + bytes_read: usize, + } + + impl Read for TrackedRead { + fn read(&mut self, buf: &mut [u8]) -> std::io::Result { + let v = self.inner.read(buf)?; + self.bytes_read += v; + Ok(v) + } + } + + let mut tracked = TrackedRead { + inner: input, + bytes_read: 0, + }; + let header = read_page_header(&mut tracked)?; + Ok((tracked.bytes_read, header)) +} + /// Decodes a [`Page`] from the provided `buffer` pub(crate) fn decode_page( page_header: PageHeader, @@ -652,11 +677,7 @@ impl SerializedPageReader { } } else { let mut read = self.reader.get_read(*offset as u64)?; - let (header_len, header) = read_page_header_len( - &mut read, - #[cfg(feature = "encryption")] - None, - )?; + let (header_len, header) = read_page_header_len(&mut read)?; *offset += header_len; *remaining_bytes -= header_len; let page_meta = if let Ok(_page_meta) = PageMetadata::try_from(&header) { @@ -736,16 +757,20 @@ impl PageReader for SerializedPageReader { *header } else { #[cfg(feature = "encryption")] - let crypto_context = page_crypto_context( - &self.crypto_context, - *page_ordinal, - *require_dictionary, - )?; - let (header_len, header) = read_page_header_len( - &mut read, - #[cfg(feature = "encryption")] - crypto_context, - )?; + let (header_len, header) = if self.crypto_context.is_some() { + let crypto_context = page_crypto_context( + &self.crypto_context, + *page_ordinal, + *require_dictionary, + )?; + read_encrypted_page_header_len(&mut read, crypto_context)? + } else { + read_page_header_len(&mut read)? 
+ }; + + #[cfg(not(feature = "encryption"))] + let (header_len, header) = read_page_header_len(&mut read)?; + verify_page_header_len(header_len, *remaining)?; *offset += header_len; *remaining -= header_len; @@ -855,11 +880,7 @@ impl PageReader for SerializedPageReader { } } else { let mut read = self.reader.get_read(*offset as u64)?; - let (header_len, header) = read_page_header_len( - &mut read, - #[cfg(feature = "encryption")] - None, - )?; + let (header_len, header) = read_page_header_len(&mut read)?; verify_page_header_len(header_len, *remaining_bytes)?; *offset += header_len; *remaining_bytes -= header_len; @@ -922,11 +943,7 @@ impl PageReader for SerializedPageReader { *remaining_bytes -= buffered_header.compressed_page_size as usize; } else { let mut read = self.reader.get_read(*offset as u64)?; - let (header_len, header) = read_page_header_len( - &mut read, - #[cfg(feature = "encryption")] - None, - )?; + let (header_len, header) = read_page_header_len(&mut read)?; verify_page_header_len(header_len, *remaining_bytes)?; verify_page_size( header.compressed_page_size, From 3f3e46ec2d909a9fb3af8d1a753ebd867901257a Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Fri, 7 Mar 2025 18:03:08 +0100 Subject: [PATCH 75/97] WIP: Decryption shouldn't change the API --- parquet/src/file/metadata/mod.rs | 2 +- parquet/src/file/metadata/reader.rs | 37 ++++++++++----------------- parquet/src/file/serialized_reader.rs | 36 ++++++++++---------------- 3 files changed, 27 insertions(+), 48 deletions(-) diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index 804f33110603..d90f0fe97d61 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -623,6 +623,7 @@ impl RowGroupMetaData { self.file_offset } + #[cfg(feature = "encryption")] pub fn from_encrypted_thrift( schema_descr: SchemaDescPtr, mut rg: RowGroup, @@ -639,7 +640,6 @@ impl RowGroupMetaData { let num_rows = rg.num_rows; let mut columns = vec![]; - #[cfg(feature 
= "encryption")] for (i, (mut c, d)) in rg .columns .drain(0..) diff --git a/parquet/src/file/metadata/reader.rs b/parquet/src/file/metadata/reader.rs index 71cf48e8e746..e2fa8ffaf98d 100644 --- a/parquet/src/file/metadata/reader.rs +++ b/parquet/src/file/metadata/reader.rs @@ -577,9 +577,9 @@ impl ParquetMetaDataReader { if self.file_decryption_properties.is_some() { return Self::decrypt_metadata( chunk_reader.get_bytes(start, metadata_len)?.as_ref(), - true, - self.file_decryption_properties.as_ref() - ) + footer.is_encrypted_footer(), + self.file_decryption_properties.as_ref(), + ); } Self::decode_metadata(chunk_reader.get_bytes(start, metadata_len)?.as_ref()) } @@ -644,25 +644,12 @@ impl ParquetMetaDataReader { if length > suffix_len - FOOTER_SIZE { let metadata_start = file_size - length - FOOTER_SIZE; let meta = fetch.fetch(metadata_start..file_size - FOOTER_SIZE).await?; - Ok(( - Self::decode_metadata( - &meta, - // footer.is_encrypted_footer(), - // #[cfg(feature = "encryption")] - // file_decryption_properties, - )?, - None, - )) + Ok((Self::decode_metadata(&meta)?, None)) } else { let metadata_start = file_size - length - FOOTER_SIZE - footer_start; let slice = &suffix[metadata_start..suffix_len - FOOTER_SIZE]; Ok(( - Self::decode_metadata( - slice, - // footer.is_encrypted_footer(), - // #[cfg(feature = "encryption")] - // file_decryption_properties, - )?, + Self::decode_metadata(slice)?, Some((footer_start, suffix.slice(..metadata_start))), )) } @@ -703,15 +690,15 @@ impl ParquetMetaDataReader { Self::decode_footer_tail(slice).map(|f| f.metadata_length) } + #[cfg(feature = "encryption")] pub fn decrypt_metadata( buf: &[u8], encrypted_footer: bool, - #[cfg(feature = "encryption")] file_decryption_properties: Option< - &FileDecryptionProperties, - >, + file_decryption_properties: Option<&FileDecryptionProperties>, ) -> Result { let mut prot = TCompactSliceInputProtocol::new(buf); + // todo: move to decode_metadata #[cfg(not(feature = "encryption"))] if 
encrypted_footer { return Err(general_err!( @@ -719,9 +706,7 @@ impl ParquetMetaDataReader { )); } - #[cfg(feature = "encryption")] let mut file_decryptor = None; - #[cfg(feature = "encryption")] let decrypted_fmd_buf; if encrypted_footer { @@ -761,7 +746,11 @@ impl ParquetMetaDataReader { let mut row_groups = Vec::new(); for rg in t_file_metadata.row_groups { - let r = RowGroupMetaData::from_encrypted_thrift(schema_descr.clone(), rg, file_decryptor.as_ref())?; + let r = RowGroupMetaData::from_encrypted_thrift( + schema_descr.clone(), + rg, + file_decryptor.as_ref(), + )?; row_groups.push(r); } let column_orders = diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index 04cd89dc2b42..59bcb16ffc13 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -384,17 +384,12 @@ fn read_encrypted_page_header_len( inner: input, bytes_read: 0, }; - let header = read_encrypted_page_header( - &mut tracked, - crypto_context.unwrap(), - )?; + let header = read_encrypted_page_header(&mut tracked, crypto_context.unwrap())?; Ok((tracked.bytes_read, header)) } /// Reads a [`PageHeader`] from the provided [`Read`] returning the number of bytes read. 
-fn read_page_header_len( - input: &mut T, -) -> Result<(usize, PageHeader)> { +fn read_page_header_len(input: &mut T) -> Result<(usize, PageHeader)> { /// A wrapper around a [`std::io::Read`] that keeps track of the bytes read struct TrackedRead { inner: R, @@ -423,7 +418,6 @@ pub(crate) fn decode_page( buffer: Bytes, physical_type: Type, decompressor: Option<&mut Box>, - #[cfg(feature = "encryption")] crypto_context: Option>, ) -> Result { // Verify the 32-bit CRC checksum of the page #[cfg(feature = "crc")] @@ -450,16 +444,6 @@ pub(crate) fn decode_page( can_decompress = header_v2.is_compressed.unwrap_or(true); } - #[cfg(feature = "encryption")] - let buffer: Bytes = if let Some(crypto_context) = crypto_context { - let decryptor = crypto_context.data_decryptor(); - let aad = crypto_context.create_page_aad()?; - let decrypted = decryptor.decrypt(buffer.as_ref(), &aad)?; - Bytes::from(decrypted) - } else { - buffer - }; - // TODO: page header could be huge because of statistics. We should set a // maximum page header size and abort if that is exceeded. 
let buffer = match decompressor { @@ -596,7 +580,7 @@ impl SerializedPageReader { SerializedPageReader::new_with_properties(reader, meta, total_rows, page_locations, props) } - #[allow(missing_docs)] + // #[allow(missing_docs)] #[cfg(feature = "encryption")] pub fn with_crypto_context(mut self, crypto_context: Arc) -> Self { self.crypto_context = Some(crypto_context); @@ -806,13 +790,21 @@ impl PageReader for SerializedPageReader { *page_ordinal, *require_dictionary, )?; + #[cfg(feature = "encryption")] + let buffer: Vec = if let Some(crypto_context) = crypto_context { + let decryptor = crypto_context.data_decryptor(); + let aad = crypto_context.create_page_aad()?; + let decrypted = decryptor.decrypt(buffer.as_ref(), &aad)?; + decrypted + } else { + buffer + }; + let page = decode_page( header, Bytes::from(buffer), self.physical_type, self.decompressor.as_mut(), - #[cfg(feature = "encryption")] - crypto_context, )?; if page.is_data_page() { *page_ordinal += 1; @@ -848,8 +840,6 @@ impl PageReader for SerializedPageReader { bytes, self.physical_type, self.decompressor.as_mut(), - #[cfg(feature = "encryption")] - None, )? 
} }; From 22b08ab93a493a325f1249d9b4af5331c5b2382b Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Fri, 7 Mar 2025 20:19:38 +0100 Subject: [PATCH 76/97] WIP: Decryption shouldn't change the API --- parquet/src/arrow/arrow_reader/mod.rs | 4 +- parquet/src/arrow/async_reader/mod.rs | 88 +++++++++++++++++++------ parquet/src/arrow/async_reader/store.rs | 31 +++++++-- parquet/src/file/metadata/reader.rs | 72 +++++++++++++++++--- parquet/src/file/serialized_reader.rs | 5 +- 5 files changed, 157 insertions(+), 43 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 0d20e5b28a0b..6ee429561625 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -729,8 +729,8 @@ impl Iterator for ReaderPageIterator { let ret = SerializedPageReader::new(reader, meta, total_rows, page_locations); #[cfg(feature = "encryption")] - if crypto_context.is_some() { - let ret = Ok(ret.unwrap().with_crypto_context(crypto_context.unwrap())); + { + let ret = Ok(ret.unwrap().with_crypto_context(crypto_context)); return Some(ret.map(|x| Box::new(x) as _)); } diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index e7cc66747e10..6ad2959e7ca3 100644 --- a/parquet/src/arrow/async_reader/mod.rs +++ b/parquet/src/arrow/async_reader/mod.rs @@ -70,6 +70,7 @@ mod store; use crate::arrow::schema::ParquetField; #[cfg(feature = "encryption")] use crate::encryption::decryption::CryptoContext; +use crate::encryption::decryption::FileDecryptor; #[cfg(feature = "object_store")] pub use store::*; @@ -111,10 +112,10 @@ pub trait AsyncFileReader: Send { fn get_metadata(&mut self) -> BoxFuture<'_, Result>>; #[cfg(feature = "encryption")] - fn with_file_decryption_properties( + fn get_encrypted_metadata( &mut self, - file_decryption_properties: FileDecryptionProperties, - ); + file_decryption_properties: Option, + ) -> BoxFuture<'_, Result>>; } /// This allows Box to be used as an 
AsyncFileReader, @@ -131,13 +132,12 @@ impl AsyncFileReader for Box { self.as_mut().get_metadata() } - #[cfg(feature = "encryption")] - fn with_file_decryption_properties( + fn get_encrypted_metadata( &mut self, - file_decryption_properties: FileDecryptionProperties, - ) { + file_decryption_properties: Option, + ) -> BoxFuture<'_, Result>> { self.as_mut() - .with_file_decryption_properties(file_decryption_properties); + .get_encrypted_metadata(file_decryption_properties) } } @@ -159,11 +159,33 @@ impl AsyncFileReader for T { } #[cfg(feature = "encryption")] - fn with_file_decryption_properties( + fn get_encrypted_metadata( &mut self, - file_decryption_properties: FileDecryptionProperties, - ) { - self.with_file_decryption_properties(file_decryption_properties); + file_decryption_properties: Option, + ) -> BoxFuture<'_, Result>> { + const FOOTER_SIZE_I64: i64 = FOOTER_SIZE as i64; + async move { + self.seek(SeekFrom::End(-FOOTER_SIZE_I64)).await?; + + let mut buf = [0_u8; FOOTER_SIZE]; + self.read_exact(&mut buf).await?; + + let footer = ParquetMetaDataReader::decode_footer_tail(&buf)?; + let metadata_len = footer.metadata_length(); + self.seek(SeekFrom::End(-FOOTER_SIZE_I64 - metadata_len as i64)) + .await?; + + let mut buf = Vec::with_capacity(metadata_len); + self.take(metadata_len as _).read_to_end(&mut buf).await?; + + let parquet_metadata_reader = ParquetMetaDataReader::decrypt_metadata( + &buf, + footer.is_encrypted_footer(), + file_decryption_properties.as_ref(), + )?; + Ok(Arc::new(parquet_metadata_reader)) + } + .boxed() } fn get_metadata(&mut self) -> BoxFuture<'_, Result>> { @@ -182,11 +204,7 @@ impl AsyncFileReader for T { let mut buf = Vec::with_capacity(metadata_len); self.take(metadata_len as _).read_to_end(&mut buf).await?; - // todo: decrypt - let parquet_metadata_reader = ParquetMetaDataReader::decode_metadata(&buf)?; - // #[cfg(feature = "encryption")] - // parquet_metadata_reader.with_file_decryptor(file_decryption_properties) 
Ok(Arc::new(parquet_metadata_reader)) } .boxed() @@ -209,7 +227,15 @@ impl ArrowReaderMetadata { ) -> Result { // TODO: this is all rather awkward. It would be nice if AsyncFileReader::get_metadata // took an argument to fetch the page indexes. - let mut metadata = input.get_metadata().await?; + #[cfg(feature = "encryption")] + let mut metadata = if options.file_decryption_properties.is_some() { + input + .get_encrypted_metadata(options.file_decryption_properties.clone()) + .await? + } else { + input.get_metadata().await? + }; + // let mut metadata = input.get_metadata().await?; if options.page_index && metadata.column_index().is_none() @@ -535,6 +561,7 @@ impl ParquetRecordBatchStreamBuilder { fields: self.fields, limit: self.limit, offset: self.offset, + file_decryption_properties: None, }; // Ensure schema of ParquetRecordBatchStream respects projection, and does @@ -577,6 +604,8 @@ struct ReaderFactory { limit: Option, offset: Option, + + file_decryption_properties: Option, } impl ReaderFactory @@ -1052,7 +1081,7 @@ impl RowGroups for InMemoryRowGroup<'_> { )?; #[cfg(feature = "encryption")] - let page_reader = page_reader.with_crypto_context(crypto_context.unwrap()); + let page_reader = page_reader.with_crypto_context(crypto_context); let page_reader: Box = Box::new(page_reader); @@ -1167,6 +1196,7 @@ mod tests { data: Bytes, metadata: Arc, requests: Arc>>>, + file_decryption_properties: Option, } impl AsyncFileReader for TestReader { @@ -1180,10 +1210,10 @@ mod tests { } #[cfg(feature = "encryption")] - fn with_file_decryption_properties( + fn get_encrypted_metadata( &mut self, - file_decryption_properties: FileDecryptionProperties, - ) { + file_decryption_properties: Option, + ) -> BoxFuture<'_, Result>> { todo!("we don't test for decryption yet"); } } @@ -1205,6 +1235,7 @@ mod tests { data: data.clone(), metadata: metadata.clone(), requests: Default::default(), + file_decryption_properties: None, }; let requests = async_reader.requests.clone(); @@ -1262,6 
+1293,7 @@ mod tests { data: data.clone(), metadata: metadata.clone(), requests: Default::default(), + file_decryption_properties: None, }; let requests = async_reader.requests.clone(); @@ -1327,6 +1359,7 @@ mod tests { data: data.clone(), metadata: metadata.clone(), requests: Default::default(), + file_decryption_properties: None, }; let options = ArrowReaderOptions::new().with_page_index(true); @@ -1395,6 +1428,7 @@ mod tests { data: data.clone(), metadata: metadata.clone(), requests: Default::default(), + file_decryption_properties: None, }; let builder = ParquetRecordBatchStreamBuilder::new(async_reader) @@ -1441,6 +1475,7 @@ mod tests { data: data.clone(), metadata: metadata.clone(), requests: Default::default(), + file_decryption_properties: None, }; let options = ArrowReaderOptions::new().with_page_index(true); @@ -1524,6 +1559,7 @@ mod tests { data: data.clone(), metadata: metadata.clone(), requests: Default::default(), + file_decryption_properties: None, }; let options = ArrowReaderOptions::new().with_page_index(true); @@ -1595,6 +1631,7 @@ mod tests { data: data.clone(), metadata: metadata.clone(), requests: Default::default(), + file_decryption_properties: None, }; let options = ArrowReaderOptions::new().with_page_index(true); @@ -1645,6 +1682,7 @@ mod tests { data, metadata: Arc::new(metadata), requests: Default::default(), + file_decryption_properties: None, }; let requests = test.requests.clone(); @@ -1722,6 +1760,7 @@ mod tests { data, metadata: Arc::new(metadata), requests: Default::default(), + file_decryption_properties: None, }; let stream = ParquetRecordBatchStreamBuilder::new(test.clone()) @@ -1814,6 +1853,7 @@ mod tests { data: data.clone(), metadata: metadata.clone(), requests: Default::default(), + file_decryption_properties: None, }; let a_filter = @@ -1882,6 +1922,7 @@ mod tests { data: data.clone(), metadata: metadata.clone(), requests: Default::default(), + file_decryption_properties: None, }; let requests = 
async_reader.requests.clone(); @@ -1903,6 +1944,7 @@ mod tests { filter: None, limit: None, offset: None, + file_decryption_properties: None, }; let mut skip = true; @@ -1958,6 +2000,7 @@ mod tests { data: data.clone(), metadata: metadata.clone(), requests: Default::default(), + file_decryption_properties: None, }; let builder = ParquetRecordBatchStreamBuilder::new(async_reader) @@ -2103,6 +2146,7 @@ mod tests { data: data.clone(), metadata: metadata.clone(), requests: Default::default(), + file_decryption_properties: None, }; let builder = ParquetRecordBatchStreamBuilder::new(async_reader) .await @@ -2140,6 +2184,7 @@ mod tests { data: data.clone(), metadata: metadata.clone(), requests: Default::default(), + file_decryption_properties: None, }; let mut builder = ParquetRecordBatchStreamBuilder::new(async_reader) @@ -2277,6 +2322,7 @@ mod tests { data, metadata: Arc::new(metadata), requests: Default::default(), + file_decryption_properties: None, }; let requests = test.requests.clone(); diff --git a/parquet/src/arrow/async_reader/store.rs b/parquet/src/arrow/async_reader/store.rs index 68659effbee2..38e357a9e790 100644 --- a/parquet/src/arrow/async_reader/store.rs +++ b/parquet/src/arrow/async_reader/store.rs @@ -177,9 +177,8 @@ impl AsyncFileReader for ParquetObjectReader { .with_offset_indexes(self.preload_offset_index) .with_prefetch_hint(self.metadata_size_hint); #[cfg(feature = "encryption")] - let file_decryption_properties = self.file_decryption_properties.clone().unwrap(); - #[cfg(feature = "encryption")] - let metadata = metadata.with_decryption_properties(Some(&file_decryption_properties)); + let metadata = metadata + .with_decryption_properties(self.file_decryption_properties.clone().as_ref()); let metadata = metadata.load_and_finish(self, file_size).await?; Ok(Arc::new(metadata)) @@ -187,12 +186,30 @@ impl AsyncFileReader for ParquetObjectReader { } #[cfg(feature = "encryption")] - fn with_file_decryption_properties( + fn get_encrypted_metadata( &mut 
self, - file_decryption_properties: FileDecryptionProperties, - ) { - self.file_decryption_properties = Some(file_decryption_properties); + file_decryption_properties: Option, + ) -> BoxFuture<'_, Result>> { + Box::pin(async move { + let file_size = self.meta.size; + let metadata = ParquetMetaDataReader::new() + .with_column_indexes(self.preload_column_index) + .with_offset_indexes(self.preload_offset_index) + .with_prefetch_hint(self.metadata_size_hint) + .with_decryption_properties(self.file_decryption_properties.clone().as_ref()); + + let metadata = metadata.load_and_finish(self, file_size).await?; + Ok(Arc::new(metadata)) + }) } + + // #[cfg(feature = "encryption")] + // fn with_file_decryption_properties( + // &mut self, + // file_decryption_properties: FileDecryptionProperties, + // ) { + // self.file_decryption_properties = Some(file_decryption_properties); + // } } #[cfg(test)] diff --git a/parquet/src/file/metadata/reader.rs b/parquet/src/file/metadata/reader.rs index e2fa8ffaf98d..3d0e1c9c5002 100644 --- a/parquet/src/file/metadata/reader.rs +++ b/parquet/src/file/metadata/reader.rs @@ -168,11 +168,6 @@ impl ParquetMetaDataReader { self } - #[cfg(feature = "encryption")] - fn read_encrypted(&self) -> bool { - self.file_decryption_properties.is_some() - } - /// Indicates whether this reader has a [`ParquetMetaData`] internally. 
pub fn has_metadata(&self) -> bool { self.metadata.is_some() @@ -419,15 +414,19 @@ impl ParquetMetaDataReader { mut fetch: F, file_size: usize, ) -> Result<()> { - let (metadata, remainder) = Self::load_metadata( + #[cfg(feature = "encryption")] + let (metadata, remainder) = Self::load_encrypted_metadata( &mut fetch, file_size, self.get_prefetch_size(), - #[cfg(feature = "encryption")] self.file_decryption_properties.as_ref(), ) .await?; + #[cfg(not(feature = "encryption"))] + let (metadata, remainder) = + Self::load_metadata(&mut fetch, file_size, self.get_prefetch_size()).await?; + self.metadata = Some(metadata); // we can return if page indexes aren't requested @@ -597,14 +596,67 @@ impl ParquetMetaDataReader { FOOTER_SIZE } + #[cfg(all(feature = "async", feature = "arrow", feature = "encryption"))] + async fn load_encrypted_metadata( + fetch: &mut F, + file_size: usize, + prefetch: usize, + file_decryption_properties: Option<&FileDecryptionProperties>, + ) -> Result<(ParquetMetaData, Option<(usize, Bytes)>)> { + if file_size < FOOTER_SIZE { + return Err(eof_err!("file size of {} is less than footer", file_size)); + } + + // If a size hint is provided, read more than the minimum size + // to try and avoid a second fetch. + // Note: prefetch > file_size is ok since we're using saturating_sub. 
+ let footer_start = file_size.saturating_sub(prefetch); + + let suffix = fetch.fetch(footer_start..file_size).await?; + let suffix_len = suffix.len(); + let fetch_len = file_size - footer_start; + if suffix_len < fetch_len { + return Err(eof_err!( + "metadata requires {} bytes, but could only read {}", + fetch_len, + suffix_len + )); + } + + let mut footer = [0; FOOTER_SIZE]; + footer.copy_from_slice(&suffix[suffix_len - FOOTER_SIZE..suffix_len]); + + let footer = Self::decode_footer_tail(&footer)?; + let length = footer.metadata_length(); + + if file_size < length + FOOTER_SIZE { + return Err(eof_err!( + "file size of {} is less than footer + metadata {}", + file_size, + length + FOOTER_SIZE + )); + } + + // Did not fetch the entire file metadata in the initial read, need to make a second request + if length > suffix_len - FOOTER_SIZE { + let metadata_start = file_size - length - FOOTER_SIZE; + let meta = fetch.fetch(metadata_start..file_size - FOOTER_SIZE).await?; + Ok((Self::decode_metadata(&meta)?, None)) + } else { + let metadata_start = file_size - length - FOOTER_SIZE - footer_start; + let slice = &suffix[metadata_start..suffix_len - FOOTER_SIZE]; + Ok(( + Self::decode_metadata(slice)?, + Some((footer_start, suffix.slice(..metadata_start))), + )) + } + } + #[cfg(all(feature = "async", feature = "arrow"))] async fn load_metadata( fetch: &mut F, file_size: usize, prefetch: usize, - #[cfg(feature = "encryption")] file_decryption_properties: Option< - &FileDecryptionProperties, - >, ) -> Result<(ParquetMetaData, Option<(usize, Bytes)>)> { if file_size < FOOTER_SIZE { return Err(eof_err!("file size of {} is less than footer", file_size)); diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index 59bcb16ffc13..2bf7c3998ae6 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -358,7 +358,6 @@ pub(crate) fn read_encrypted_page_header( Ok(PageHeader::read_from_in_protocol(&mut 
prot)?) } -// todo: decrypt /// Reads a [`PageHeader`] from the provided [`Read`] returning the number of bytes read. /// If the page header is encrypted [`CryptoContext`] must be provided. #[cfg(feature = "encryption")] @@ -582,8 +581,8 @@ impl SerializedPageReader { // #[allow(missing_docs)] #[cfg(feature = "encryption")] - pub fn with_crypto_context(mut self, crypto_context: Arc) -> Self { - self.crypto_context = Some(crypto_context); + pub fn with_crypto_context(mut self, crypto_context: Option>) -> Self { + self.crypto_context = crypto_context; self } From ff9eb789a9b831e724a36955c90dd09d69bf4d46 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Fri, 7 Mar 2025 22:34:22 +0100 Subject: [PATCH 77/97] WIP: Decryption shouldn't change the API --- parquet/src/arrow/arrow_reader/mod.rs | 1 + parquet/src/arrow/async_reader/mod.rs | 28 ++++++++++++++++++++++----- parquet/src/file/metadata/reader.rs | 15 ++++++-------- 3 files changed, 30 insertions(+), 14 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 6ee429561625..574e5fc5c7d3 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -734,6 +734,7 @@ impl Iterator for ReaderPageIterator { return Some(ret.map(|x| Box::new(x) as _)); } + #[cfg(not(feature = "encryption"))] Some(ret.map(|x| Box::new(x) as _)) } } diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index 6ad2959e7ca3..d69cf0dae667 100644 --- a/parquet/src/arrow/async_reader/mod.rs +++ b/parquet/src/arrow/async_reader/mod.rs @@ -62,15 +62,12 @@ mod metadata; pub use metadata::*; #[cfg(feature = "encryption")] -use crate::encryption::decryption::FileDecryptionProperties; +use crate::encryption::decryption::{CryptoContext, FileDecryptionProperties}; #[cfg(feature = "object_store")] mod store; use crate::arrow::schema::ParquetField; -#[cfg(feature = "encryption")] -use crate::encryption::decryption::CryptoContext; -use 
crate::encryption::decryption::FileDecryptor; #[cfg(feature = "object_store")] pub use store::*; @@ -132,6 +129,7 @@ impl AsyncFileReader for Box { self.as_mut().get_metadata() } + #[cfg(feature = "encryption")] fn get_encrypted_metadata( &mut self, file_decryption_properties: Option, @@ -235,7 +233,8 @@ impl ArrowReaderMetadata { } else { input.get_metadata().await? }; - // let mut metadata = input.get_metadata().await?; + #[cfg(not(feature = "encryption"))] + let mut metadata = input.get_metadata().await?; if options.page_index && metadata.column_index().is_none() @@ -561,6 +560,7 @@ impl ParquetRecordBatchStreamBuilder { fields: self.fields, limit: self.limit, offset: self.offset, + #[cfg(feature = "encryption")] file_decryption_properties: None, }; @@ -605,6 +605,7 @@ struct ReaderFactory { offset: Option, + #[cfg(feature = "encryption")] file_decryption_properties: Option, } @@ -1196,6 +1197,7 @@ mod tests { data: Bytes, metadata: Arc, requests: Arc>>>, + #[cfg(feature = "encryption")] file_decryption_properties: Option, } @@ -1235,6 +1237,7 @@ mod tests { data: data.clone(), metadata: metadata.clone(), requests: Default::default(), + #[cfg(feature = "encryption")] file_decryption_properties: None, }; @@ -1293,6 +1296,7 @@ mod tests { data: data.clone(), metadata: metadata.clone(), requests: Default::default(), + #[cfg(feature = "encryption")] file_decryption_properties: None, }; @@ -1359,6 +1363,7 @@ mod tests { data: data.clone(), metadata: metadata.clone(), requests: Default::default(), + #[cfg(feature = "encryption")] file_decryption_properties: None, }; @@ -1428,6 +1433,7 @@ mod tests { data: data.clone(), metadata: metadata.clone(), requests: Default::default(), + #[cfg(feature = "encryption")] file_decryption_properties: None, }; @@ -1475,6 +1481,7 @@ mod tests { data: data.clone(), metadata: metadata.clone(), requests: Default::default(), + #[cfg(feature = "encryption")] file_decryption_properties: None, }; @@ -1559,6 +1566,7 @@ mod tests { data: 
data.clone(), metadata: metadata.clone(), requests: Default::default(), + #[cfg(feature = "encryption")] file_decryption_properties: None, }; @@ -1631,6 +1639,7 @@ mod tests { data: data.clone(), metadata: metadata.clone(), requests: Default::default(), + #[cfg(feature = "encryption")] file_decryption_properties: None, }; @@ -1682,6 +1691,7 @@ mod tests { data, metadata: Arc::new(metadata), requests: Default::default(), + #[cfg(feature = "encryption")] file_decryption_properties: None, }; let requests = test.requests.clone(); @@ -1760,6 +1770,7 @@ mod tests { data, metadata: Arc::new(metadata), requests: Default::default(), + #[cfg(feature = "encryption")] file_decryption_properties: None, }; @@ -1853,6 +1864,7 @@ mod tests { data: data.clone(), metadata: metadata.clone(), requests: Default::default(), + #[cfg(feature = "encryption")] file_decryption_properties: None, }; @@ -1922,6 +1934,7 @@ mod tests { data: data.clone(), metadata: metadata.clone(), requests: Default::default(), + #[cfg(feature = "encryption")] file_decryption_properties: None, }; @@ -1944,6 +1957,7 @@ mod tests { filter: None, limit: None, offset: None, + #[cfg(feature = "encryption")] file_decryption_properties: None, }; @@ -2000,6 +2014,7 @@ mod tests { data: data.clone(), metadata: metadata.clone(), requests: Default::default(), + #[cfg(feature = "encryption")] file_decryption_properties: None, }; @@ -2146,6 +2161,7 @@ mod tests { data: data.clone(), metadata: metadata.clone(), requests: Default::default(), + #[cfg(feature = "encryption")] file_decryption_properties: None, }; let builder = ParquetRecordBatchStreamBuilder::new(async_reader) @@ -2184,6 +2200,7 @@ mod tests { data: data.clone(), metadata: metadata.clone(), requests: Default::default(), + #[cfg(feature = "encryption")] file_decryption_properties: None, }; @@ -2322,6 +2339,7 @@ mod tests { data, metadata: Arc::new(metadata), requests: Default::default(), + #[cfg(feature = "encryption")] file_decryption_properties: None, }; let 
requests = test.requests.clone(); diff --git a/parquet/src/file/metadata/reader.rs b/parquet/src/file/metadata/reader.rs index 3d0e1c9c5002..a52a4f7d516f 100644 --- a/parquet/src/file/metadata/reader.rs +++ b/parquet/src/file/metadata/reader.rs @@ -570,6 +570,12 @@ impl ParquetMetaDataReader { if footer_metadata_len > file_size as usize { return Err(ParquetError::NeedMoreData(footer_metadata_len)); } + #[cfg(not(feature = "encryption"))] + if footer.encrypted_footer { + return Err(general_err!( + "Parquet file has an encrypted footer but the encryption feature is disabled" + )); + } let start = file_size - footer_metadata_len as u64; #[cfg(feature = "encryption")] @@ -749,15 +755,6 @@ impl ParquetMetaDataReader { file_decryption_properties: Option<&FileDecryptionProperties>, ) -> Result { let mut prot = TCompactSliceInputProtocol::new(buf); - - // todo: move to decode_metadata - #[cfg(not(feature = "encryption"))] - if encrypted_footer { - return Err(general_err!( - "Parquet file has an encrypted footer but the encryption feature is disabled" - )); - } - let mut file_decryptor = None; let decrypted_fmd_buf; From ee7643e643de2d7cf3d52035a5cb1520260d8ccb Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Sat, 8 Mar 2025 00:16:08 +0100 Subject: [PATCH 78/97] WIP: Decryption shouldn't change the API --- parquet/src/arrow/arrow_reader/mod.rs | 2 +- parquet/src/arrow/async_reader/mod.rs | 12 ++-- parquet/src/arrow/async_reader/store.rs | 13 +---- parquet/src/encryption/decryption.rs | 2 + parquet/src/file/metadata/mod.rs | 1 + parquet/src/file/metadata/reader.rs | 77 ++++--------------------- parquet/src/file/serialized_reader.rs | 5 +- 7 files changed, 25 insertions(+), 87 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 574e5fc5c7d3..eb47012321b8 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -731,7 +731,7 @@ impl Iterator for ReaderPageIterator { #[cfg(feature 
= "encryption")] { let ret = Ok(ret.unwrap().with_crypto_context(crypto_context)); - return Some(ret.map(|x| Box::new(x) as _)); + Some(ret.map(|x| Box::new(x) as _)) } #[cfg(not(feature = "encryption"))] diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index d69cf0dae667..cfdfd00ec59a 100644 --- a/parquet/src/arrow/async_reader/mod.rs +++ b/parquet/src/arrow/async_reader/mod.rs @@ -108,6 +108,8 @@ pub trait AsyncFileReader: Send { /// for caching, pre-fetching, catalog metadata, etc... fn get_metadata(&mut self) -> BoxFuture<'_, Result>>; + /// Provides asynchronous access to the [`ParquetMetaData`] of encrypted parquet + /// files, like get_metadata does for unencrypted ones. #[cfg(feature = "encryption")] fn get_encrypted_metadata( &mut self, @@ -560,8 +562,6 @@ impl ParquetRecordBatchStreamBuilder { fields: self.fields, limit: self.limit, offset: self.offset, - #[cfg(feature = "encryption")] - file_decryption_properties: None, }; // Ensure schema of ParquetRecordBatchStream respects projection, and does @@ -604,9 +604,6 @@ struct ReaderFactory { limit: Option, offset: Option, - - #[cfg(feature = "encryption")] - file_decryption_properties: Option, } impl ReaderFactory @@ -1192,6 +1189,7 @@ mod tests { use tempfile::tempfile; use tokio::fs::File; + #[allow(dead_code)] #[derive(Clone)] struct TestReader { data: Bytes, @@ -1214,7 +1212,7 @@ mod tests { #[cfg(feature = "encryption")] fn get_encrypted_metadata( &mut self, - file_decryption_properties: Option, + _file_decryption_properties: Option, ) -> BoxFuture<'_, Result>> { todo!("we don't test for decryption yet"); } @@ -1957,8 +1955,6 @@ mod tests { filter: None, limit: None, offset: None, - #[cfg(feature = "encryption")] - file_decryption_properties: None, }; let mut skip = true; diff --git a/parquet/src/arrow/async_reader/store.rs b/parquet/src/arrow/async_reader/store.rs index 38e357a9e790..1c909c8ae59f 100644 --- a/parquet/src/arrow/async_reader/store.rs +++ 
b/parquet/src/arrow/async_reader/store.rs @@ -196,20 +196,13 @@ impl AsyncFileReader for ParquetObjectReader { .with_column_indexes(self.preload_column_index) .with_offset_indexes(self.preload_offset_index) .with_prefetch_hint(self.metadata_size_hint) - .with_decryption_properties(self.file_decryption_properties.clone().as_ref()); + .with_decryption_properties(file_decryption_properties.as_ref()) + .load_and_finish(self, file_size) + .await?; - let metadata = metadata.load_and_finish(self, file_size).await?; Ok(Arc::new(metadata)) }) } - - // #[cfg(feature = "encryption")] - // fn with_file_decryption_properties( - // &mut self, - // file_decryption_properties: FileDecryptionProperties, - // ) { - // self.file_decryption_properties = Some(file_decryption_properties); - // } } #[cfg(test)] diff --git a/parquet/src/encryption/decryption.rs b/parquet/src/encryption/decryption.rs index ed6d5921a5d2..58b202b44199 100644 --- a/parquet/src/encryption/decryption.rs +++ b/parquet/src/encryption/decryption.rs @@ -36,6 +36,8 @@ pub fn read_and_decrypt( decryptor.decrypt(&ciphertext, aad.as_ref()) } +// CryptoContext is a data structure that holds the context required to +// decrypt parquet modules (data pages, dictionary pages, etc.). #[derive(Debug, Clone)] pub struct CryptoContext { pub(crate) row_group_ordinal: usize, diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index d90f0fe97d61..a671ef7a1051 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -623,6 +623,7 @@ impl RowGroupMetaData { self.file_offset } + /// Method to convert from encrypted Thrift. 
#[cfg(feature = "encryption")] pub fn from_encrypted_thrift( schema_descr: SchemaDescPtr, diff --git a/parquet/src/file/metadata/reader.rs b/parquet/src/file/metadata/reader.rs index a52a4f7d516f..c5e134fe43c2 100644 --- a/parquet/src/file/metadata/reader.rs +++ b/parquet/src/file/metadata/reader.rs @@ -415,13 +415,8 @@ impl ParquetMetaDataReader { file_size: usize, ) -> Result<()> { #[cfg(feature = "encryption")] - let (metadata, remainder) = Self::load_encrypted_metadata( - &mut fetch, - file_size, - self.get_prefetch_size(), - self.file_decryption_properties.as_ref(), - ) - .await?; + let (metadata, remainder) = + Self::load_metadata(&mut fetch, file_size, self.get_prefetch_size()).await?; #[cfg(not(feature = "encryption"))] let (metadata, remainder) = @@ -602,62 +597,6 @@ impl ParquetMetaDataReader { FOOTER_SIZE } - #[cfg(all(feature = "async", feature = "arrow", feature = "encryption"))] - async fn load_encrypted_metadata( - fetch: &mut F, - file_size: usize, - prefetch: usize, - file_decryption_properties: Option<&FileDecryptionProperties>, - ) -> Result<(ParquetMetaData, Option<(usize, Bytes)>)> { - if file_size < FOOTER_SIZE { - return Err(eof_err!("file size of {} is less than footer", file_size)); - } - - // If a size hint is provided, read more than the minimum size - // to try and avoid a second fetch. - // Note: prefetch > file_size is ok since we're using saturating_sub. 
- let footer_start = file_size.saturating_sub(prefetch); - - let suffix = fetch.fetch(footer_start..file_size).await?; - let suffix_len = suffix.len(); - let fetch_len = file_size - footer_start; - if suffix_len < fetch_len { - return Err(eof_err!( - "metadata requires {} bytes, but could only read {}", - fetch_len, - suffix_len - )); - } - - let mut footer = [0; FOOTER_SIZE]; - footer.copy_from_slice(&suffix[suffix_len - FOOTER_SIZE..suffix_len]); - - let footer = Self::decode_footer_tail(&footer)?; - let length = footer.metadata_length(); - - if file_size < length + FOOTER_SIZE { - return Err(eof_err!( - "file size of {} is less than footer + metadata {}", - file_size, - length + FOOTER_SIZE - )); - } - - // Did not fetch the entire file metadata in the initial read, need to make a second request - if length > suffix_len - FOOTER_SIZE { - let metadata_start = file_size - length - FOOTER_SIZE; - let meta = fetch.fetch(metadata_start..file_size - FOOTER_SIZE).await?; - Ok((Self::decode_metadata(&meta)?, None)) - } else { - let metadata_start = file_size - length - FOOTER_SIZE - footer_start; - let slice = &suffix[metadata_start..suffix_len - FOOTER_SIZE]; - Ok(( - Self::decode_metadata(slice)?, - Some((footer_start, suffix.slice(..metadata_start))), - )) - } - } - #[cfg(all(feature = "async", feature = "arrow"))] async fn load_metadata( fetch: &mut F, @@ -748,6 +687,15 @@ impl ParquetMetaDataReader { Self::decode_footer_tail(slice).map(|f| f.metadata_length) } + /// Decodes [`ParquetMetaData`] from the provided encrypted bytes. + /// + /// Typically this is used to decode the metadata from the end of a parquet + /// file. The format of `buf` is the Thrift compact binary protocol, as specified + /// by the [Parquet Spec]. Buffer can is encrypted with AES GCM or AES CTR + /// ciphers as specfied in the [Parquet Encryption Spec]. 
+ /// + /// [Parquet Spec]: https://github.com/apache/parquet-format#metadata + /// [Parquet Encryption Spec]: https://parquet.apache.org/docs/file-format/data-pages/encryption/ #[cfg(feature = "encryption")] pub fn decrypt_metadata( buf: &[u8], @@ -837,8 +785,7 @@ impl ParquetMetaDataReader { let mut row_groups = Vec::new(); for rg in t_file_metadata.row_groups { - let r = RowGroupMetaData::from_thrift(schema_descr.clone(), rg)?; - row_groups.push(r); + row_groups.push(RowGroupMetaData::from_thrift(schema_descr.clone(), rg)?); } let column_orders = Self::parse_column_orders(t_file_metadata.column_orders, &schema_descr)?; diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index 2bf7c3998ae6..e76197503145 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -579,7 +579,7 @@ impl SerializedPageReader { SerializedPageReader::new_with_properties(reader, meta, total_rows, page_locations, props) } - // #[allow(missing_docs)] + /// Adds cryptographical information to the reader. #[cfg(feature = "encryption")] pub fn with_crypto_context(mut self, crypto_context: Option>) -> Self { self.crypto_context = crypto_context; @@ -793,8 +793,7 @@ impl PageReader for SerializedPageReader { let buffer: Vec = if let Some(crypto_context) = crypto_context { let decryptor = crypto_context.data_decryptor(); let aad = crypto_context.create_page_aad()?; - let decrypted = decryptor.decrypt(buffer.as_ref(), &aad)?; - decrypted + decryptor.decrypt(buffer.as_ref(), &aad)? 
} else { buffer }; From 000334da01080490c6a9d3bef111f2c69cbff559 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Sat, 8 Mar 2025 02:42:14 +0100 Subject: [PATCH 79/97] Review feedback --- parquet/src/arrow/arrow_reader/mod.rs | 81 +--------- parquet/src/arrow/async_reader/mod.rs | 98 +------------ parquet/src/file/metadata/reader.rs | 2 +- .../src/util/test_common/encryption_util.rs | 138 ++++++++++++++++++ parquet/src/util/test_common/mod.rs | 3 + 5 files changed, 150 insertions(+), 172 deletions(-) create mode 100644 parquet/src/util/test_common/encryption_util.rs diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index eb47012321b8..5d796effb693 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -1020,6 +1020,8 @@ mod tests { use crate::file::writer::SerializedFileWriter; use crate::schema::parser::parse_message_type; use crate::schema::types::{Type, TypePtr}; + #[cfg(feature = "encryption")] + use crate::util::test_common::encryption_util::verify_encryption_test_file_read; use crate::util::test_common::rand_gen::RandGen; #[test] @@ -1993,85 +1995,6 @@ mod tests { }; } - #[cfg(feature = "encryption")] - fn verify_encryption_test_file_read( - file: File, - decryption_properties: FileDecryptionProperties, - ) { - let options = ArrowReaderOptions::default() - .with_file_decryption_properties(decryption_properties.clone()); - let metadata = ArrowReaderMetadata::load(&file, options.clone()).unwrap(); - let file_metadata = metadata.metadata.file_metadata(); - - let builder = ParquetRecordBatchReaderBuilder::try_new_with_options(file, options).unwrap(); - let record_reader = builder.build().unwrap(); - - assert_eq!(file_metadata.num_rows(), 50); - assert_eq!(file_metadata.schema_descr().num_columns(), 8); - assert_eq!( - file_metadata.created_by().unwrap(), - "parquet-cpp-arrow version 19.0.0-SNAPSHOT" - ); - - metadata.metadata.row_groups().iter().for_each(|rg| { - 
assert_eq!(rg.num_columns(), 8); - assert_eq!(rg.num_rows(), 50); - }); - - let mut row_count = 0; - for batch in record_reader { - let batch = batch.unwrap(); - row_count += batch.num_rows(); - - let bool_col = batch.column(0).as_boolean(); - let time_col = batch - .column(1) - .as_primitive::(); - let list_col = batch.column(2).as_list::(); - let timestamp_col = batch - .column(3) - .as_primitive::(); - let f32_col = batch.column(4).as_primitive::(); - let f64_col = batch.column(5).as_primitive::(); - let binary_col = batch.column(6).as_binary::(); - let fixed_size_binary_col = batch.column(7).as_fixed_size_binary(); - - for (i, x) in bool_col.iter().enumerate() { - assert_eq!(x.unwrap(), i % 2 == 0); - } - for (i, x) in time_col.iter().enumerate() { - assert_eq!(x.unwrap(), i as i32); - } - for (i, list_item) in list_col.iter().enumerate() { - let list_item = list_item.unwrap(); - let list_item = list_item.as_primitive::(); - assert_eq!(list_item.len(), 2); - assert_eq!(list_item.value(0), ((i * 2) * 1000000000000) as i64); - assert_eq!(list_item.value(1), ((i * 2 + 1) * 1000000000000) as i64); - } - for x in timestamp_col.iter() { - assert!(x.is_some()); - } - for (i, x) in f32_col.iter().enumerate() { - assert_eq!(x.unwrap(), i as f32 * 1.1f32); - } - for (i, x) in f64_col.iter().enumerate() { - assert_eq!(x.unwrap(), i as f64 * 1.1111111f64); - } - for (i, x) in binary_col.iter().enumerate() { - assert_eq!(x.is_some(), i % 2 == 0); - if let Some(x) = x { - assert_eq!(&x[0..7], b"parquet"); - } - } - for (i, x) in fixed_size_binary_col.iter().enumerate() { - assert_eq!(x.unwrap(), &[i as u8; 10]); - } - } - - assert_eq!(row_count, file_metadata.num_rows() as usize); - } - #[test] fn test_read_float32_float64_byte_stream_split() { let path = format!( diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index cfdfd00ec59a..ca66d20994b4 100644 --- a/parquet/src/arrow/async_reader/mod.rs +++ 
b/parquet/src/arrow/async_reader/mod.rs @@ -1167,10 +1167,13 @@ mod tests { use crate::arrow::arrow_reader::{ ArrowPredicateFn, ParquetRecordBatchReaderBuilder, RowSelector, }; + use crate::arrow::arrow_reader::{ArrowReaderMetadata, ArrowReaderOptions}; use crate::arrow::schema::parquet_to_arrow_schema_and_fields; use crate::arrow::ArrowWriter; use crate::file::metadata::ParquetMetaDataReader; use crate::file::properties::WriterProperties; + #[cfg(feature = "encryption")] + use crate::util::test_common::encryption_util::verify_encryption_test_file_read_async; use arrow::compute::kernels::cmp::eq; use arrow::error::Result as ArrowResult; use arrow_array::builder::{ListBuilder, StringBuilder}; @@ -2492,95 +2495,6 @@ mod tests { assert_eq!(result.len(), 1); } - #[cfg(feature = "encryption")] - async fn verify_encryption_test_file_read( - file: &mut File, - decryption_properties: FileDecryptionProperties, - ) { - let options = - ArrowReaderOptions::new().with_file_decryption_properties(decryption_properties); - - let metadata = ArrowReaderMetadata::load_async(file, options.clone()) - .await - .unwrap(); - let arrow_reader_metadata = ArrowReaderMetadata::load_async(file, options) - .await - .unwrap(); - let file_metadata = metadata.metadata.file_metadata(); - - let record_reader = ParquetRecordBatchStreamBuilder::new_with_metadata( - file.try_clone().await.unwrap(), - arrow_reader_metadata.clone(), - ) - .build() - .unwrap(); - let record_batches = record_reader.try_collect::>().await.unwrap(); - - assert_eq!(file_metadata.num_rows(), 50); - assert_eq!(file_metadata.schema_descr().num_columns(), 8); - assert_eq!( - file_metadata.created_by().unwrap(), - "parquet-cpp-arrow version 19.0.0-SNAPSHOT" - ); - - metadata.metadata.row_groups().iter().for_each(|rg| { - assert_eq!(rg.num_columns(), 8); - assert_eq!(rg.num_rows(), 50); - }); - - let mut row_count = 0; - for batch in record_batches { - row_count += batch.num_rows(); - - let bool_col = batch.column(0).as_boolean(); 
- let time_col = batch - .column(1) - .as_primitive::(); - let list_col = batch.column(2).as_list::(); - let timestamp_col = batch - .column(3) - .as_primitive::(); - let f32_col = batch.column(4).as_primitive::(); - let f64_col = batch.column(5).as_primitive::(); - let binary_col = batch.column(6).as_binary::(); - let fixed_size_binary_col = batch.column(7).as_fixed_size_binary(); - - for (i, x) in bool_col.iter().enumerate() { - assert_eq!(x.unwrap(), i % 2 == 0); - } - for (i, x) in time_col.iter().enumerate() { - assert_eq!(x.unwrap(), i as i32); - } - for (i, list_item) in list_col.iter().enumerate() { - let list_item = list_item.unwrap(); - let list_item = list_item.as_primitive::(); - assert_eq!(list_item.len(), 2); - assert_eq!(list_item.value(0), ((i * 2) * 1000000000000) as i64); - assert_eq!(list_item.value(1), ((i * 2 + 1) * 1000000000000) as i64); - } - for x in timestamp_col.iter() { - assert!(x.is_some()); - } - for (i, x) in f32_col.iter().enumerate() { - assert_eq!(x.unwrap(), i as f32 * 1.1f32); - } - for (i, x) in f64_col.iter().enumerate() { - assert_eq!(x.unwrap(), i as f64 * 1.1111111f64); - } - for (i, x) in binary_col.iter().enumerate() { - assert_eq!(x.is_some(), i % 2 == 0); - if let Some(x) = x { - assert_eq!(&x[0..7], b"parquet"); - } - } - for (i, x) in fixed_size_binary_col.iter().enumerate() { - assert_eq!(x.unwrap(), &[i as u8; 10]); - } - } - - assert_eq!(row_count, file_metadata.num_rows() as usize); - } - #[tokio::test] #[cfg(feature = "encryption")] async fn test_non_uniform_encryption_plaintext_footer() { @@ -2600,7 +2514,7 @@ mod tests { .build() .unwrap(); - verify_encryption_test_file_read(&mut file, decryption_properties).await; + verify_encryption_test_file_read_async(&mut file, decryption_properties).await; } #[tokio::test] @@ -2680,7 +2594,7 @@ mod tests { .build() .unwrap(); - verify_encryption_test_file_read(&mut file, decryption_properties).await; + verify_encryption_test_file_read_async(&mut file, 
decryption_properties).await; } #[tokio::test] @@ -2695,7 +2609,7 @@ mod tests { .build() .unwrap(); - verify_encryption_test_file_read(&mut file, decryption_properties).await; + verify_encryption_test_file_read_async(&mut file, decryption_properties).await; } #[tokio::test] diff --git a/parquet/src/file/metadata/reader.rs b/parquet/src/file/metadata/reader.rs index c5e134fe43c2..6e797e174964 100644 --- a/parquet/src/file/metadata/reader.rs +++ b/parquet/src/file/metadata/reader.rs @@ -697,7 +697,7 @@ impl ParquetMetaDataReader { /// [Parquet Spec]: https://github.com/apache/parquet-format#metadata /// [Parquet Encryption Spec]: https://parquet.apache.org/docs/file-format/data-pages/encryption/ #[cfg(feature = "encryption")] - pub fn decrypt_metadata( + pub(crate) fn decrypt_metadata( buf: &[u8], encrypted_footer: bool, file_decryption_properties: Option<&FileDecryptionProperties>, diff --git a/parquet/src/util/test_common/encryption_util.rs b/parquet/src/util/test_common/encryption_util.rs new file mode 100644 index 000000000000..c4e06df1d0bc --- /dev/null +++ b/parquet/src/util/test_common/encryption_util.rs @@ -0,0 +1,138 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::arrow::arrow_reader::{ + ArrowReaderMetadata, ArrowReaderOptions, ParquetRecordBatchReaderBuilder, +}; +use crate::arrow::ParquetRecordBatchStreamBuilder; +use crate::encryption::decryption::FileDecryptionProperties; +use crate::file::metadata::FileMetaData; +use arrow_array::cast::AsArray; +use arrow_array::{types, RecordBatch}; +use futures::TryStreamExt; +use std::fs::File; + +pub(crate) fn verify_encryption_test_file_read( + file: File, + decryption_properties: FileDecryptionProperties, +) { + let options = ArrowReaderOptions::default() + .with_file_decryption_properties(decryption_properties.clone()); + let metadata = ArrowReaderMetadata::load(&file, options.clone()).unwrap(); + let file_metadata = metadata.metadata.file_metadata(); + + let builder = ParquetRecordBatchReaderBuilder::try_new_with_options(file, options).unwrap(); + let record_reader = builder.build().unwrap(); + let record_batches = record_reader + .map(|x| x.unwrap()) + .collect::>(); + + verify_encryption_test_data(record_batches, file_metadata.clone(), metadata); +} + +pub(crate) async fn verify_encryption_test_file_read_async( + file: &mut tokio::fs::File, + decryption_properties: FileDecryptionProperties, +) { + let options = ArrowReaderOptions::new().with_file_decryption_properties(decryption_properties); + + let metadata = ArrowReaderMetadata::load_async(file, options.clone()) + .await + .unwrap(); + let arrow_reader_metadata = ArrowReaderMetadata::load_async(file, options) + .await + .unwrap(); + let file_metadata = metadata.metadata.file_metadata(); + + let record_reader = ParquetRecordBatchStreamBuilder::new_with_metadata( + file.try_clone().await.unwrap(), + arrow_reader_metadata.clone(), + ) + .build() + .unwrap(); + let record_batches = record_reader.try_collect::>().await.unwrap(); + + verify_encryption_test_data(record_batches, file_metadata.clone(), metadata); +} + +/// Tests reading an encrypted file from the parquet-testing repository +fn 
verify_encryption_test_data( + record_batches: Vec, + file_metadata: FileMetaData, + metadata: ArrowReaderMetadata, +) { + assert_eq!(file_metadata.num_rows(), 50); + assert_eq!(file_metadata.schema_descr().num_columns(), 8); + + metadata.metadata.row_groups().iter().for_each(|rg| { + assert_eq!(rg.num_columns(), 8); + assert_eq!(rg.num_rows(), 50); + }); + + let mut row_count = 0; + for batch in record_batches { + let batch = batch; + row_count += batch.num_rows(); + + let bool_col = batch.column(0).as_boolean(); + let time_col = batch + .column(1) + .as_primitive::(); + let list_col = batch.column(2).as_list::(); + let timestamp_col = batch + .column(3) + .as_primitive::(); + let f32_col = batch.column(4).as_primitive::(); + let f64_col = batch.column(5).as_primitive::(); + let binary_col = batch.column(6).as_binary::(); + let fixed_size_binary_col = batch.column(7).as_fixed_size_binary(); + + for (i, x) in bool_col.iter().enumerate() { + assert_eq!(x.unwrap(), i % 2 == 0); + } + for (i, x) in time_col.iter().enumerate() { + assert_eq!(x.unwrap(), i as i32); + } + for (i, list_item) in list_col.iter().enumerate() { + let list_item = list_item.unwrap(); + let list_item = list_item.as_primitive::(); + assert_eq!(list_item.len(), 2); + assert_eq!(list_item.value(0), ((i * 2) * 1000000000000) as i64); + assert_eq!(list_item.value(1), ((i * 2 + 1) * 1000000000000) as i64); + } + for x in timestamp_col.iter() { + assert!(x.is_some()); + } + for (i, x) in f32_col.iter().enumerate() { + assert_eq!(x.unwrap(), i as f32 * 1.1f32); + } + for (i, x) in f64_col.iter().enumerate() { + assert_eq!(x.unwrap(), i as f64 * 1.1111111f64); + } + for (i, x) in binary_col.iter().enumerate() { + assert_eq!(x.is_some(), i % 2 == 0); + if let Some(x) = x { + assert_eq!(&x[0..7], b"parquet"); + } + } + for (i, x) in fixed_size_binary_col.iter().enumerate() { + assert_eq!(x.unwrap(), &[i as u8; 10]); + } + } + + assert_eq!(row_count, file_metadata.num_rows() as usize); +} diff --git 
a/parquet/src/util/test_common/mod.rs b/parquet/src/util/test_common/mod.rs index 8cfc1e6dd423..ac36118c3702 100644 --- a/parquet/src/util/test_common/mod.rs +++ b/parquet/src/util/test_common/mod.rs @@ -22,3 +22,6 @@ pub mod file_util; #[cfg(test)] pub mod rand_gen; + +#[cfg(all(test, feature = "encryption", feature = "arrow"))] +pub mod encryption_util; From 276fc1a153c7a64bdf50401d007224b12ee1ddd8 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Sat, 8 Mar 2025 03:46:16 +0100 Subject: [PATCH 80/97] Handle common encryption errors Co-authored-by: Corwin Joy Co-authored-by: Adam Reeve --- parquet/src/arrow/arrow_reader/mod.rs | 9 ++ parquet/src/arrow/async_reader/mod.rs | 153 ++++++++++++++---- parquet/src/encryption/ciphers.rs | 10 +- parquet/src/encryption/decryption.rs | 28 ++-- parquet/src/file/metadata/mod.rs | 13 +- parquet/src/file/metadata/reader.rs | 13 +- .../src/util/test_common/encryption_util.rs | 19 +-- 7 files changed, 178 insertions(+), 67 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 5d796effb693..063d055773b3 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -708,8 +708,17 @@ impl Iterator for ReaderPageIterator { if file_decryptor.is_column_encrypted(column_name.name()) { let data_decryptor = file_decryptor.get_column_data_decryptor(column_name.name()); + let data_decryptor = match data_decryptor { + Ok(data_decryptor) => data_decryptor, + Err(err) => return Some(Err(err)), + }; + let metadata_decryptor = file_decryptor.get_column_metadata_decryptor(column_name.name()); + let metadata_decryptor = match metadata_decryptor { + Ok(metadata_decryptor) => metadata_decryptor, + Err(err) => return Some(Err(err)), + }; let crypto_context = CryptoContext::new( rg_idx, diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index ca66d20994b4..5d1743ad7511 100644 --- a/parquet/src/arrow/async_reader/mod.rs +++ 
b/parquet/src/arrow/async_reader/mod.rs @@ -1031,35 +1031,35 @@ impl RowGroups for InMemoryRowGroup<'_> { fn column_chunks(&self, i: usize) -> Result> { #[cfg(feature = "encryption")] - let crypto_context = if let Some(file_decryptor) = - &self.parquet_metadata.clone().file_decryptor().clone() - { - let column_name = &self - .parquet_metadata - .clone() - .file_metadata() - .schema_descr() - .column(i); - - if file_decryptor.is_column_encrypted(column_name.name()) { - let data_decryptor = file_decryptor.get_column_data_decryptor(column_name.name()); - let metadata_decryptor = - file_decryptor.get_column_metadata_decryptor(column_name.name()); - - let crypto_context = CryptoContext::new( - self.row_group_ordinal, - i, - data_decryptor, - metadata_decryptor, - file_decryptor.file_aad().clone(), - ); - Some(Arc::new(crypto_context)) + let crypto_context = + if let Some(file_decryptor) = &self.parquet_metadata.clone().file_decryptor().clone() { + let column_name = &self + .parquet_metadata + .clone() + .file_metadata() + .schema_descr() + .column(i); + + if file_decryptor.is_column_encrypted(column_name.name()) { + let data_decryptor = + file_decryptor.get_column_data_decryptor(column_name.name())?; + let metadata_decryptor = + file_decryptor.get_column_metadata_decryptor(column_name.name())?; + + let crypto_context = CryptoContext::new( + self.row_group_ordinal, + i, + data_decryptor, + metadata_decryptor, + file_decryptor.file_aad().clone(), + ); + Some(Arc::new(crypto_context)) + } else { + None + } } else { None - } - } else { - None - }; + }; match &self.column_chunks[i] { None => Err(ParquetError::General(format!( @@ -2514,7 +2514,100 @@ mod tests { .build() .unwrap(); - verify_encryption_test_file_read_async(&mut file, decryption_properties).await; + let _ = verify_encryption_test_file_read_async(&mut file, decryption_properties).await; + } + + #[tokio::test] + #[cfg(feature = "encryption")] + async fn test_misspecified_encryption_keys() { + let testdata = 
arrow::util::test_util::parquet_test_data(); + let path = format!("{testdata}/encrypt_columns_and_footer.parquet.encrypted"); + + // There is always a footer key even with a plaintext footer, + // but this is used for signing the footer. + let footer_key = "0123456789012345".as_bytes(); // 128bit/16 + let column_1_key = "1234567890123450".as_bytes(); + let column_2_key = "1234567890123451".as_bytes(); + + // read file with keys and check for expected error message + async fn check_for_error( + expected_message: &str, + path: &String, + footer_key: &[u8], + column_1_key: &[u8], + column_2_key: &[u8], + ) { + let mut file = File::open(&path).await.unwrap(); + + let mut decryption_properties = FileDecryptionProperties::builder(footer_key.to_vec()); + + if column_1_key.is_empty() { + decryption_properties = + decryption_properties.with_column_key("double_field", column_1_key.to_vec()); + } + + if column_2_key.is_empty() { + decryption_properties = + decryption_properties.with_column_key("float_field", column_2_key.to_vec()); + } + + let decryption_properties = decryption_properties.build().unwrap(); + + match verify_encryption_test_file_read_async(&mut file, decryption_properties).await { + Ok(_) => { + panic!("did not get expected error") + } + Err(e) => { + assert_eq!(e.to_string(), expected_message); + } + } + } + + // Too short footer key + check_for_error( + "Parquet error: Invalid footer key. Failed to create AES key", + &path, + "bad_pwd".as_bytes(), + column_1_key, + column_2_key, + ) + .await; + + // Wrong footer key + check_for_error( + "Parquet error: Provided footer key was unable to decrypt parquet footer", + &path, + "1123456789012345".as_bytes(), + column_1_key, + column_2_key, + ) + .await; + + // todo: should this be double_field? 
+ // Missing column key + check_for_error("Parquet error: Unable to decrypt column 'float_field', perhaps the column key is wrong or missing?", + &path, footer_key, "".as_bytes(), column_2_key).await; + + // Too short column key + check_for_error( + // todo: should report key length error + // "Parquet error: Failed to create AES key", + "Parquet error: Unable to decrypt column 'float_field', perhaps the column key is wrong or missing?", + &path, + footer_key, + "abc".as_bytes(), + column_2_key, + ) + .await; + + // todo: should this be double_field? + // Wrong column key + check_for_error("Parquet error: Unable to decrypt column 'float_field', perhaps the column key is wrong or missing?", + &path, footer_key, "1123456789012345".as_bytes(), column_2_key).await; + + // Mixed up keys + check_for_error("Parquet error: Unable to decrypt column 'float_field', perhaps the column key is wrong or missing?", + &path, footer_key, column_2_key, column_1_key).await; } #[tokio::test] @@ -2594,7 +2687,7 @@ mod tests { .build() .unwrap(); - verify_encryption_test_file_read_async(&mut file, decryption_properties).await; + let _ = verify_encryption_test_file_read_async(&mut file, decryption_properties).await; } #[tokio::test] @@ -2609,7 +2702,7 @@ mod tests { .build() .unwrap(); - verify_encryption_test_file_read_async(&mut file, decryption_properties).await; + let _ = verify_encryption_test_file_read_async(&mut file, decryption_properties).await; } #[tokio::test] diff --git a/parquet/src/encryption/ciphers.rs b/parquet/src/encryption/ciphers.rs index aa2a75893dc7..9b5a04d622f0 100644 --- a/parquet/src/encryption/ciphers.rs +++ b/parquet/src/encryption/ciphers.rs @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. 
+use crate::errors::ParquetError::General; use crate::errors::Result; use ring::aead::{Aad, LessSafeKey, UnboundKey, AES_128_GCM}; use std::fmt::Debug; @@ -33,13 +34,14 @@ pub(crate) struct RingGcmBlockDecryptor { } impl RingGcmBlockDecryptor { - pub(crate) fn new(key_bytes: &[u8]) -> Self { + pub(crate) fn new(key_bytes: &[u8]) -> Result { // todo support other key sizes - let key = UnboundKey::new(&AES_128_GCM, key_bytes).unwrap(); + let key = UnboundKey::new(&AES_128_GCM, key_bytes) + .map_err(|_| General("Failed to create AES key".to_string()))?; - Self { + Ok(Self { key: LessSafeKey::new(key), - } + }) } } diff --git a/parquet/src/encryption/decryption.rs b/parquet/src/encryption/decryption.rs index 58b202b44199..1a8761885e46 100644 --- a/parquet/src/encryption/decryption.rs +++ b/parquet/src/encryption/decryption.rs @@ -17,7 +17,7 @@ use crate::encryption::ciphers::{BlockDecryptor, RingGcmBlockDecryptor}; use crate::encryption::modules::{create_module_aad, ModuleType}; -use crate::errors::Result; +use crate::errors::{ParquetError, Result}; use std::collections::HashMap; use std::io::Read; use std::sync::Arc; @@ -208,25 +208,33 @@ impl FileDecryptor { decryption_properties: &FileDecryptionProperties, aad_file_unique: Vec, aad_prefix: Vec, - ) -> Self { + ) -> Result { let file_aad = [aad_prefix.as_slice(), aad_file_unique.as_slice()].concat(); // todo decr: if no key available yet (not set in properties, should be retrieved from metadata) - let footer_decryptor = RingGcmBlockDecryptor::new(&decryption_properties.footer_key); + let footer_decryptor = RingGcmBlockDecryptor::new(&decryption_properties.footer_key) + .map_err(|e| { + let msg = String::from("Invalid footer key. 
") + + e.to_string().replace("Parquet error: ", "").as_str(); + ParquetError::General(msg) + })?; - Self { + Ok(Self { footer_decryptor: Some(Arc::new(footer_decryptor)), decryption_properties: decryption_properties.clone(), file_aad, - } + }) } - pub(crate) fn get_footer_decryptor(&self) -> Arc { - self.footer_decryptor.clone().unwrap() + pub(crate) fn get_footer_decryptor(&self) -> Result, ParquetError> { + Ok(self.footer_decryptor.clone().unwrap()) } - pub(crate) fn get_column_data_decryptor(&self, column_name: &str) -> Arc { + pub(crate) fn get_column_data_decryptor( + &self, + column_name: &str, + ) -> Result, ParquetError> { match self.decryption_properties.column_keys.get(column_name) { - Some(column_key) => Arc::new(RingGcmBlockDecryptor::new(column_key)), + Some(column_key) => Ok(Arc::new(RingGcmBlockDecryptor::new(column_key)?)), None => self.get_footer_decryptor(), } } @@ -234,7 +242,7 @@ impl FileDecryptor { pub(crate) fn get_column_metadata_decryptor( &self, column_name: &str, - ) -> Arc { + ) -> Result, ParquetError> { // Once GCM CTR mode is implemented, data and metadata decryptors may be different self.get_column_data_decryptor(column_name) } diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index a671ef7a1051..bfa0ea6aba17 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -652,16 +652,16 @@ impl RowGroupMetaData { let column_decryptor = match c.crypto_metadata.as_ref() { None => { return Err(general_err!( - "No crypto_metadata is set for column {}, which has encrypted metadata", - i + "No crypto_metadata is set for column '{}', which has encrypted metadata", + d.path().string() )); } Some(ColumnCryptoMetaData::ENCRYPTIONWITHCOLUMNKEY(crypto_metadata)) => { let column_name = crypto_metadata.path_in_schema.join("."); - decryptor.get_column_metadata_decryptor(column_name.as_str()) + decryptor.get_column_metadata_decryptor(column_name.as_str())? 
} Some(ColumnCryptoMetaData::ENCRYPTIONWITHFOOTERKEY(_)) => { - decryptor.get_footer_decryptor() + decryptor.get_footer_decryptor()? } }; @@ -675,7 +675,10 @@ impl RowGroupMetaData { let buf = c.encrypted_column_metadata.clone().unwrap(); let decrypted_cc_buf = - column_decryptor.decrypt(buf.as_slice(), column_aad.as_ref())?; + column_decryptor.decrypt(buf.as_slice(), column_aad.as_ref()).map_err(|_| { + general_err!("Unable to decrypt column '{}', perhaps the column key is wrong or missing?", + d.path().string()) + })?; let mut prot = TCompactSliceInputProtocol::new(decrypted_cc_buf.as_slice()); c.meta_data = Some(ColumnMetaData::read_from_in_protocol(&mut prot)?); diff --git a/parquet/src/file/metadata/reader.rs b/parquet/src/file/metadata/reader.rs index 6e797e174964..f3baf0c16cb1 100644 --- a/parquet/src/file/metadata/reader.rs +++ b/parquet/src/file/metadata/reader.rs @@ -718,8 +718,11 @@ impl ParquetMetaDataReader { let footer_decryptor = decryptor.get_footer_decryptor(); let aad_footer = create_footer_aad(decryptor.file_aad())?; - decrypted_fmd_buf = - footer_decryptor.decrypt(prot.as_slice().as_ref(), aad_footer.as_ref())?; + decrypted_fmd_buf = footer_decryptor? 
+ .decrypt(prot.as_slice().as_ref(), aad_footer.as_ref()) + .map_err(|_| { + general_err!("Provided footer key was unable to decrypt parquet footer") + })?; prot = TCompactSliceInputProtocol::new(decrypted_fmd_buf.as_ref()); file_decryptor = Some(decryptor); @@ -846,11 +849,7 @@ fn get_file_decryptor( .ok_or_else(|| general_err!("AAD unique file identifier is not set"))?; let aad_prefix: Vec = algo.aad_prefix.unwrap_or_default(); - Ok(FileDecryptor::new( - file_decryption_properties, - aad_file_unique, - aad_prefix, - )) + FileDecryptor::new(file_decryption_properties, aad_file_unique, aad_prefix) } EncryptionAlgorithm::AESGCMCTRV1(_) => Err(nyi_err!( "The AES_GCM_CTR_V1 encryption algorithm is not yet supported" diff --git a/parquet/src/util/test_common/encryption_util.rs b/parquet/src/util/test_common/encryption_util.rs index c4e06df1d0bc..51d6f701db80 100644 --- a/parquet/src/util/test_common/encryption_util.rs +++ b/parquet/src/util/test_common/encryption_util.rs @@ -20,6 +20,7 @@ use crate::arrow::arrow_reader::{ }; use crate::arrow::ParquetRecordBatchStreamBuilder; use crate::encryption::decryption::FileDecryptionProperties; +use crate::errors::ParquetError; use crate::file::metadata::FileMetaData; use arrow_array::cast::AsArray; use arrow_array::{types, RecordBatch}; @@ -47,26 +48,22 @@ pub(crate) fn verify_encryption_test_file_read( pub(crate) async fn verify_encryption_test_file_read_async( file: &mut tokio::fs::File, decryption_properties: FileDecryptionProperties, -) { +) -> Result<(), ParquetError> { let options = ArrowReaderOptions::new().with_file_decryption_properties(decryption_properties); - let metadata = ArrowReaderMetadata::load_async(file, options.clone()) - .await - .unwrap(); - let arrow_reader_metadata = ArrowReaderMetadata::load_async(file, options) - .await - .unwrap(); + let metadata = ArrowReaderMetadata::load_async(file, options.clone()).await?; + let arrow_reader_metadata = ArrowReaderMetadata::load_async(file, options).await?; let 
file_metadata = metadata.metadata.file_metadata(); let record_reader = ParquetRecordBatchStreamBuilder::new_with_metadata( - file.try_clone().await.unwrap(), + file.try_clone().await?, arrow_reader_metadata.clone(), ) - .build() - .unwrap(); - let record_batches = record_reader.try_collect::>().await.unwrap(); + .build()?; + let record_batches = record_reader.try_collect::>().await?; verify_encryption_test_data(record_batches, file_metadata.clone(), metadata); + Ok(()) } /// Tests reading an encrypted file from the parquet-testing repository From c805b516a63177b8aeb2c0053e0efd08c520c659 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Mon, 10 Mar 2025 10:19:23 +0100 Subject: [PATCH 81/97] Apply suggestions from code review Co-authored-by: Adam Reeve --- parquet/src/encryption/decryption.rs | 5 +---- parquet/src/file/metadata/mod.rs | 4 ++-- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/parquet/src/encryption/decryption.rs b/parquet/src/encryption/decryption.rs index 1a8761885e46..4afe6abebc55 100644 --- a/parquet/src/encryption/decryption.rs +++ b/parquet/src/encryption/decryption.rs @@ -212,10 +212,7 @@ impl FileDecryptor { let file_aad = [aad_prefix.as_slice(), aad_file_unique.as_slice()].concat(); // todo decr: if no key available yet (not set in properties, should be retrieved from metadata) let footer_decryptor = RingGcmBlockDecryptor::new(&decryption_properties.footer_key) - .map_err(|e| { - let msg = String::from("Invalid footer key. ") - + e.to_string().replace("Parquet error: ", "").as_str(); - ParquetError::General(msg) + .map_err(|e| general_err!("Invalid footer key. {}", e.to_string().replace("Parquet error: ", "")))?; })?; Ok(Self { diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index bfa0ea6aba17..cfe34f4c3e58 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -234,8 +234,8 @@ impl ParquetMetaData { /// Returns file decryptor as reference. 
#[cfg(feature = "encryption")] - pub fn file_decryptor(&self) -> &Option { - &self.file_decryptor + pub fn file_decryptor(&self) -> Option<&FileDecryptor> { + self.file_decryptor.as_ref() } /// Returns number of row groups in this file. From d7eefd910ac5d0d57dafd0bfc16c6bb4a14e8e8d Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Mon, 10 Mar 2025 10:45:47 +0100 Subject: [PATCH 82/97] Update parquet/src/arrow/async_reader/mod.rs Co-authored-by: Adam Reeve --- parquet/src/arrow/async_reader/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index 5d1743ad7511..6cad667d0c6e 100644 --- a/parquet/src/arrow/async_reader/mod.rs +++ b/parquet/src/arrow/async_reader/mod.rs @@ -2514,7 +2514,7 @@ mod tests { .build() .unwrap(); - let _ = verify_encryption_test_file_read_async(&mut file, decryption_properties).await; + verify_encryption_test_file_read_async(&mut file, decryption_properties).await.unwrap(); } #[tokio::test] From 751b54e71a9f1e05373795cc6d6ed8903a42d1a7 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Mon, 10 Mar 2025 11:04:22 +0100 Subject: [PATCH 83/97] Fix previous commit --- parquet/src/encryption/decryption.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/parquet/src/encryption/decryption.rs b/parquet/src/encryption/decryption.rs index 4afe6abebc55..06704d705710 100644 --- a/parquet/src/encryption/decryption.rs +++ b/parquet/src/encryption/decryption.rs @@ -213,7 +213,6 @@ impl FileDecryptor { // todo decr: if no key available yet (not set in properties, should be retrieved from metadata) let footer_decryptor = RingGcmBlockDecryptor::new(&decryption_properties.footer_key) .map_err(|e| general_err!("Invalid footer key. 
{}", e.to_string().replace("Parquet error: ", "")))?; - })?; Ok(Self { footer_decryptor: Some(Arc::new(footer_decryptor)), From fec6880df033c2c965168ec2dc6b68aea2f8ff86 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Mon, 10 Mar 2025 14:04:52 +0100 Subject: [PATCH 84/97] Add TestReader::new, less pub functions, add test_non_uniform_encryption_disabled_aad_storage --- parquet/src/arrow/arrow_reader/mod.rs | 58 +++++++++++++++++++++++++ parquet/src/arrow/async_reader/mod.rs | 57 +++++++++--------------- parquet/src/arrow/async_reader/store.rs | 9 ++-- parquet/src/encryption/decryption.rs | 18 +++++--- parquet/src/encryption/mod.rs | 6 +-- parquet/src/file/metadata/reader.rs | 10 ++++- 6 files changed, 104 insertions(+), 54 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 063d055773b3..bfccbde9eb98 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -1881,6 +1881,64 @@ mod tests { verify_encryption_test_file_read(file, decryption_properties); } + #[test] + #[cfg(feature = "encryption")] + fn test_non_uniform_encryption_disabled_aad_storage() { + let testdata = arrow::util::test_util::parquet_test_data(); + let path = + format!("{testdata}/encrypt_columns_and_footer_disable_aad_storage.parquet.encrypted"); + let file = File::open(path.clone()).unwrap(); + + let footer_key = "0123456789012345".as_bytes(); // 128bit/16 + let column_1_key = "1234567890123450".as_bytes(); + let column_2_key = "1234567890123451".as_bytes(); + + // Provided AAD prefix overrides the one stored in the file + let decryption_properties = FileDecryptionProperties::builder(footer_key.to_vec()) + .with_column_key("double_field", column_1_key.to_vec()) + .with_column_key("float_field", column_2_key.to_vec()) + .with_aad_prefix("tester") + .build() + .unwrap(); + + verify_encryption_test_file_read(file, decryption_properties); + + // Using wrong AAD prefix should fail + let decryption_properties 
= FileDecryptionProperties::builder(footer_key.to_vec()) + .with_column_key("double_field", column_1_key.to_vec()) + .with_column_key("float_field", column_2_key.to_vec()) + .with_aad_prefix("wrong_aad_prefix") + .build() + .unwrap(); + + let file = File::open(path.clone()).unwrap(); + let options = ArrowReaderOptions::default() + .with_file_decryption_properties(decryption_properties.clone()); + let result = ArrowReaderMetadata::load(&file, options.clone()); + assert!(result.is_err()); + assert_eq!( + result.unwrap_err().to_string(), + "Parquet error: Provided footer key and AAD were unable to decrypt parquet footer" + ); + + // Using wrong AAD prefix stored in the file should fail + let decryption_properties = FileDecryptionProperties::builder(footer_key.to_vec()) + .with_column_key("double_field", column_1_key.to_vec()) + .with_column_key("float_field", column_2_key.to_vec()) + .build() + .unwrap(); + + let file = File::open(path).unwrap(); + let options = ArrowReaderOptions::default() + .with_file_decryption_properties(decryption_properties.clone()); + let result = ArrowReaderMetadata::load(&file, options.clone()); + assert!(result.is_err()); + assert_eq!( + result.unwrap_err().to_string(), + "Parquet error: Provided footer key and AAD were unable to decrypt parquet footer" + ); + } + #[test] fn test_non_uniform_encryption_plaintext_footer_without_decryption() { let testdata = arrow::util::test_util::parquet_test_data(); diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index 6cad667d0c6e..73cbe166f40a 100644 --- a/parquet/src/arrow/async_reader/mod.rs +++ b/parquet/src/arrow/async_reader/mod.rs @@ -1198,8 +1198,21 @@ mod tests { data: Bytes, metadata: Arc, requests: Arc>>>, - #[cfg(feature = "encryption")] - file_decryption_properties: Option, + } + + #[cfg(feature = "encryption")] + impl TestReader { + async fn new( + data: Bytes, + metadata: Arc, + requests: Arc>>>, + ) -> Self { + Self { + data, + metadata, + 
requests, + } + } } impl AsyncFileReader for TestReader { @@ -1238,8 +1251,6 @@ mod tests { data: data.clone(), metadata: metadata.clone(), requests: Default::default(), - #[cfg(feature = "encryption")] - file_decryption_properties: None, }; let requests = async_reader.requests.clone(); @@ -1297,8 +1308,6 @@ mod tests { data: data.clone(), metadata: metadata.clone(), requests: Default::default(), - #[cfg(feature = "encryption")] - file_decryption_properties: None, }; let requests = async_reader.requests.clone(); @@ -1364,8 +1373,6 @@ mod tests { data: data.clone(), metadata: metadata.clone(), requests: Default::default(), - #[cfg(feature = "encryption")] - file_decryption_properties: None, }; let options = ArrowReaderOptions::new().with_page_index(true); @@ -1434,8 +1441,6 @@ mod tests { data: data.clone(), metadata: metadata.clone(), requests: Default::default(), - #[cfg(feature = "encryption")] - file_decryption_properties: None, }; let builder = ParquetRecordBatchStreamBuilder::new(async_reader) @@ -1482,8 +1487,6 @@ mod tests { data: data.clone(), metadata: metadata.clone(), requests: Default::default(), - #[cfg(feature = "encryption")] - file_decryption_properties: None, }; let options = ArrowReaderOptions::new().with_page_index(true); @@ -1567,8 +1570,6 @@ mod tests { data: data.clone(), metadata: metadata.clone(), requests: Default::default(), - #[cfg(feature = "encryption")] - file_decryption_properties: None, }; let options = ArrowReaderOptions::new().with_page_index(true); @@ -1640,8 +1641,6 @@ mod tests { data: data.clone(), metadata: metadata.clone(), requests: Default::default(), - #[cfg(feature = "encryption")] - file_decryption_properties: None, }; let options = ArrowReaderOptions::new().with_page_index(true); @@ -1692,8 +1691,6 @@ mod tests { data, metadata: Arc::new(metadata), requests: Default::default(), - #[cfg(feature = "encryption")] - file_decryption_properties: None, }; let requests = test.requests.clone(); @@ -1771,8 +1768,6 @@ mod tests { 
data, metadata: Arc::new(metadata), requests: Default::default(), - #[cfg(feature = "encryption")] - file_decryption_properties: None, }; let stream = ParquetRecordBatchStreamBuilder::new(test.clone()) @@ -1865,8 +1860,6 @@ mod tests { data: data.clone(), metadata: metadata.clone(), requests: Default::default(), - #[cfg(feature = "encryption")] - file_decryption_properties: None, }; let a_filter = @@ -1935,8 +1928,6 @@ mod tests { data: data.clone(), metadata: metadata.clone(), requests: Default::default(), - #[cfg(feature = "encryption")] - file_decryption_properties: None, }; let requests = async_reader.requests.clone(); @@ -2013,8 +2004,6 @@ mod tests { data: data.clone(), metadata: metadata.clone(), requests: Default::default(), - #[cfg(feature = "encryption")] - file_decryption_properties: None, }; let builder = ParquetRecordBatchStreamBuilder::new(async_reader) @@ -2160,8 +2149,6 @@ mod tests { data: data.clone(), metadata: metadata.clone(), requests: Default::default(), - #[cfg(feature = "encryption")] - file_decryption_properties: None, }; let builder = ParquetRecordBatchStreamBuilder::new(async_reader) .await @@ -2199,8 +2186,6 @@ mod tests { data: data.clone(), metadata: metadata.clone(), requests: Default::default(), - #[cfg(feature = "encryption")] - file_decryption_properties: None, }; let mut builder = ParquetRecordBatchStreamBuilder::new(async_reader) @@ -2338,8 +2323,6 @@ mod tests { data, metadata: Arc::new(metadata), requests: Default::default(), - #[cfg(feature = "encryption")] - file_decryption_properties: None, }; let requests = test.requests.clone(); @@ -2514,7 +2497,9 @@ mod tests { .build() .unwrap(); - verify_encryption_test_file_read_async(&mut file, decryption_properties).await.unwrap(); + verify_encryption_test_file_read_async(&mut file, decryption_properties) + .await + .unwrap(); } #[tokio::test] @@ -2575,7 +2560,7 @@ mod tests { // Wrong footer key check_for_error( - "Parquet error: Provided footer key was unable to decrypt parquet 
footer", + "Parquet error: Provided footer key and AAD were unable to decrypt parquet footer", &path, "1123456789012345".as_bytes(), column_1_key, @@ -2687,7 +2672,7 @@ mod tests { .build() .unwrap(); - let _ = verify_encryption_test_file_read_async(&mut file, decryption_properties).await; + verify_encryption_test_file_read_async(&mut file, decryption_properties).await; } #[tokio::test] @@ -2702,7 +2687,7 @@ mod tests { .build() .unwrap(); - let _ = verify_encryption_test_file_read_async(&mut file, decryption_properties).await; + verify_encryption_test_file_read_async(&mut file, decryption_properties).await; } #[tokio::test] diff --git a/parquet/src/arrow/async_reader/store.rs b/parquet/src/arrow/async_reader/store.rs index 1c909c8ae59f..78114c840701 100644 --- a/parquet/src/arrow/async_reader/store.rs +++ b/parquet/src/arrow/async_reader/store.rs @@ -175,12 +175,9 @@ impl AsyncFileReader for ParquetObjectReader { let metadata = ParquetMetaDataReader::new() .with_column_indexes(self.preload_column_index) .with_offset_indexes(self.preload_offset_index) - .with_prefetch_hint(self.metadata_size_hint); - #[cfg(feature = "encryption")] - let metadata = metadata - .with_decryption_properties(self.file_decryption_properties.clone().as_ref()); - - let metadata = metadata.load_and_finish(self, file_size).await?; + .with_prefetch_hint(self.metadata_size_hint) + .load_and_finish(self, file_size) + .await?; Ok(Arc::new(metadata)) }) } diff --git a/parquet/src/encryption/decryption.rs b/parquet/src/encryption/decryption.rs index 06704d705710..931132cdd63f 100644 --- a/parquet/src/encryption/decryption.rs +++ b/parquet/src/encryption/decryption.rs @@ -39,7 +39,7 @@ pub fn read_and_decrypt( // CryptoContext is a data structure that holds the context required to // decrypt parquet modules (data pages, dictionary pages, etc.). 
#[derive(Debug, Clone)] -pub struct CryptoContext { +pub(crate) struct CryptoContext { pub(crate) row_group_ordinal: usize, pub(crate) column_ordinal: usize, pub(crate) page_ordinal: Option, @@ -145,7 +145,7 @@ impl CryptoContext { pub struct FileDecryptionProperties { footer_key: Vec, column_keys: HashMap>, - aad_prefix: Option>, + pub(crate) aad_prefix: Option>, } impl FileDecryptionProperties { @@ -178,8 +178,8 @@ impl DecryptionPropertiesBuilder { }) } - pub fn with_aad_prefix(mut self, value: Vec) -> Self { - self.aad_prefix = Some(value); + pub fn with_aad_prefix(mut self, value: &str) -> Self { + self.aad_prefix = Some(value.as_bytes().to_vec()); self } @@ -191,7 +191,7 @@ impl DecryptionPropertiesBuilder { } #[derive(Clone, Debug)] -pub struct FileDecryptor { +pub(crate) struct FileDecryptor { decryption_properties: FileDecryptionProperties, footer_decryptor: Option>, file_aad: Vec, @@ -212,8 +212,12 @@ impl FileDecryptor { let file_aad = [aad_prefix.as_slice(), aad_file_unique.as_slice()].concat(); // todo decr: if no key available yet (not set in properties, should be retrieved from metadata) let footer_decryptor = RingGcmBlockDecryptor::new(&decryption_properties.footer_key) - .map_err(|e| general_err!("Invalid footer key. {}", e.to_string().replace("Parquet error: ", "")))?; - + .map_err(|e| { + general_err!( + "Invalid footer key. {}", + e.to_string().replace("Parquet error: ", "") + ) + })?; Ok(Self { footer_decryptor: Some(Arc::new(footer_decryptor)), decryption_properties: decryption_properties.clone(), diff --git a/parquet/src/encryption/mod.rs b/parquet/src/encryption/mod.rs index 1e33bf4fbd6d..1deba749f8e6 100644 --- a/parquet/src/encryption/mod.rs +++ b/parquet/src/encryption/mod.rs @@ -18,6 +18,6 @@ //! Encryption implementation specific to Parquet, as described //! in the [spec](https://github.com/apache/parquet-format/blob/master/Encryption.md). 
-pub mod ciphers; -pub mod decryption; -pub mod modules; +mod ciphers; +pub(crate) mod decryption; +pub(crate) mod modules; diff --git a/parquet/src/file/metadata/reader.rs b/parquet/src/file/metadata/reader.rs index f3baf0c16cb1..23720f25b2bf 100644 --- a/parquet/src/file/metadata/reader.rs +++ b/parquet/src/file/metadata/reader.rs @@ -721,7 +721,9 @@ impl ParquetMetaDataReader { decrypted_fmd_buf = footer_decryptor? .decrypt(prot.as_slice().as_ref(), aad_footer.as_ref()) .map_err(|_| { - general_err!("Provided footer key was unable to decrypt parquet footer") + general_err!( + "Provided footer key and AAD were unable to decrypt parquet footer" + ) })?; prot = TCompactSliceInputProtocol::new(decrypted_fmd_buf.as_ref()); @@ -847,7 +849,11 @@ fn get_file_decryptor( let aad_file_unique = algo .aad_file_unique .ok_or_else(|| general_err!("AAD unique file identifier is not set"))?; - let aad_prefix: Vec = algo.aad_prefix.unwrap_or_default(); + let aad_prefix = if file_decryption_properties.aad_prefix.is_some() { + file_decryption_properties.aad_prefix.clone().unwrap() + } else { + algo.aad_prefix.unwrap_or_default() + }; FileDecryptor::new(file_decryption_properties, aad_file_unique, aad_prefix) } From 7e9e7aa35c9daf235db652d982e582eb4cb52b2f Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Mon, 10 Mar 2025 15:21:04 +0100 Subject: [PATCH 85/97] Review feedback --- parquet/src/arrow/arrow_reader/mod.rs | 6 +--- parquet/src/arrow/async_reader/mod.rs | 38 +++++++------------------ parquet/src/arrow/async_reader/store.rs | 7 +---- parquet/src/encryption/decryption.rs | 14 ++++----- parquet/src/encryption/mod.rs | 2 +- parquet/src/file/metadata/mod.rs | 9 +++--- parquet/src/file/metadata/reader.rs | 5 ---- parquet/src/file/serialized_reader.rs | 5 +++- 8 files changed, 28 insertions(+), 58 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index bfccbde9eb98..8bfc38f7c876 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs 
+++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -738,12 +738,8 @@ impl Iterator for ReaderPageIterator { let ret = SerializedPageReader::new(reader, meta, total_rows, page_locations); #[cfg(feature = "encryption")] - { - let ret = Ok(ret.unwrap().with_crypto_context(crypto_context)); - Some(ret.map(|x| Box::new(x) as _)) - } + let ret = ret.map(|reader| reader.with_crypto_context(crypto_context)); - #[cfg(not(feature = "encryption"))] Some(ret.map(|x| Box::new(x) as _)) } } diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index 73cbe166f40a..3a4be3789c36 100644 --- a/parquet/src/arrow/async_reader/mod.rs +++ b/parquet/src/arrow/async_reader/mod.rs @@ -1192,7 +1192,6 @@ mod tests { use tempfile::tempfile; use tokio::fs::File; - #[allow(dead_code)] #[derive(Clone)] struct TestReader { data: Bytes, @@ -1200,21 +1199,6 @@ mod tests { requests: Arc>>>, } - #[cfg(feature = "encryption")] - impl TestReader { - async fn new( - data: Bytes, - metadata: Arc, - requests: Arc>>>, - ) -> Self { - Self { - data, - metadata, - requests, - } - } - } - impl AsyncFileReader for TestReader { fn get_bytes(&mut self, range: Range) -> BoxFuture<'_, Result> { self.requests.lock().unwrap().push(range.clone()); @@ -2526,12 +2510,12 @@ mod tests { let mut decryption_properties = FileDecryptionProperties::builder(footer_key.to_vec()); - if column_1_key.is_empty() { + if !column_1_key.is_empty() { decryption_properties = decryption_properties.with_column_key("double_field", column_1_key.to_vec()); } - if column_2_key.is_empty() { + if !column_2_key.is_empty() { decryption_properties = decryption_properties.with_column_key("float_field", column_2_key.to_vec()); } @@ -2568,16 +2552,13 @@ mod tests { ) .await; - // todo: should this be double_field? 
// Missing column key - check_for_error("Parquet error: Unable to decrypt column 'float_field', perhaps the column key is wrong or missing?", + check_for_error("Parquet error: Unable to decrypt column 'double_field', perhaps the column key is wrong or missing?", &path, footer_key, "".as_bytes(), column_2_key).await; // Too short column key check_for_error( - // todo: should report key length error - // "Parquet error: Failed to create AES key", - "Parquet error: Unable to decrypt column 'float_field', perhaps the column key is wrong or missing?", + "Parquet error: Failed to create AES key", &path, footer_key, "abc".as_bytes(), @@ -2585,9 +2566,8 @@ mod tests { ) .await; - // todo: should this be double_field? // Wrong column key - check_for_error("Parquet error: Unable to decrypt column 'float_field', perhaps the column key is wrong or missing?", + check_for_error("Parquet error: Unable to decrypt column 'double_field', perhaps the column key is wrong or missing?", &path, footer_key, "1123456789012345".as_bytes(), column_2_key).await; // Mixed up keys @@ -2672,7 +2652,9 @@ mod tests { .build() .unwrap(); - verify_encryption_test_file_read_async(&mut file, decryption_properties).await; + verify_encryption_test_file_read_async(&mut file, decryption_properties) + .await + .unwrap(); } #[tokio::test] @@ -2687,7 +2669,9 @@ mod tests { .build() .unwrap(); - verify_encryption_test_file_read_async(&mut file, decryption_properties).await; + verify_encryption_test_file_read_async(&mut file, decryption_properties) + .await + .unwrap(); } #[tokio::test] diff --git a/parquet/src/arrow/async_reader/store.rs b/parquet/src/arrow/async_reader/store.rs index 78114c840701..3fe8d2d655ef 100644 --- a/parquet/src/arrow/async_reader/store.rs +++ b/parquet/src/arrow/async_reader/store.rs @@ -60,8 +60,6 @@ pub struct ParquetObjectReader { preload_column_index: bool, preload_offset_index: bool, runtime: Option, - #[cfg(feature = "encryption")] - file_decryption_properties: Option, } impl 
ParquetObjectReader { @@ -76,8 +74,6 @@ impl ParquetObjectReader { preload_column_index: false, preload_offset_index: false, runtime: None, - #[cfg(feature = "encryption")] - file_decryption_properties: None, } } @@ -185,7 +181,7 @@ impl AsyncFileReader for ParquetObjectReader { #[cfg(feature = "encryption")] fn get_encrypted_metadata( &mut self, - file_decryption_properties: Option, + _file_decryption_properties: Option, ) -> BoxFuture<'_, Result>> { Box::pin(async move { let file_size = self.meta.size; @@ -193,7 +189,6 @@ impl AsyncFileReader for ParquetObjectReader { .with_column_indexes(self.preload_column_index) .with_offset_indexes(self.preload_offset_index) .with_prefetch_hint(self.metadata_size_hint) - .with_decryption_properties(file_decryption_properties.as_ref()) .load_and_finish(self, file_size) .await?; diff --git a/parquet/src/encryption/decryption.rs b/parquet/src/encryption/decryption.rs index 931132cdd63f..9c8435fef8b6 100644 --- a/parquet/src/encryption/decryption.rs +++ b/parquet/src/encryption/decryption.rs @@ -53,7 +53,7 @@ pub(crate) struct CryptoContext { } impl CryptoContext { - pub fn new( + pub(crate) fn new( row_group_ordinal: usize, column_ordinal: usize, data_decryptor: Arc, @@ -71,7 +71,7 @@ impl CryptoContext { } } - pub fn with_page_ordinal(&self, page_ordinal: usize) -> Self { + pub(crate) fn with_page_ordinal(&self, page_ordinal: usize) -> Self { Self { row_group_ordinal: self.row_group_ordinal, column_ordinal: self.column_ordinal, @@ -115,7 +115,7 @@ impl CryptoContext { ) } - pub fn for_dictionary_page(&self) -> Self { + pub(crate) fn for_dictionary_page(&self) -> Self { Self { row_group_ordinal: self.row_group_ordinal, column_ordinal: self.column_ordinal, @@ -127,15 +127,11 @@ impl CryptoContext { } } - pub fn data_decryptor(&self) -> &Arc { + pub(crate) fn data_decryptor(&self) -> &Arc { &self.data_decryptor } - pub fn metadata_decryptor(&self) -> &Arc { - &self.metadata_decryptor - } - - pub fn file_aad(&self) -> &Vec { + 
pub(crate) fn file_aad(&self) -> &Vec { &self.file_aad } } diff --git a/parquet/src/encryption/mod.rs b/parquet/src/encryption/mod.rs index 1deba749f8e6..362906f50134 100644 --- a/parquet/src/encryption/mod.rs +++ b/parquet/src/encryption/mod.rs @@ -18,6 +18,6 @@ //! Encryption implementation specific to Parquet, as described //! in the [spec](https://github.com/apache/parquet-format/blob/master/Encryption.md). -mod ciphers; +pub(crate) mod ciphers; pub(crate) mod decryption; pub(crate) mod modules; diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index cfe34f4c3e58..76ba935a91fa 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -200,9 +200,10 @@ impl ParquetMetaData { } } - #[allow(missing_docs)] + /// Adds [`FileDecryptor`] to this metadata instance to enable decryption of + /// encrypted data. #[cfg(feature = "encryption")] - pub fn with_file_decryptor(&mut self, file_decryptor: Option) { + pub(crate) fn with_file_decryptor(&mut self, file_decryptor: Option) { self.file_decryptor = file_decryptor; } @@ -234,7 +235,7 @@ impl ParquetMetaData { /// Returns file decryptor as reference. #[cfg(feature = "encryption")] - pub fn file_decryptor(&self) -> Option<&FileDecryptor> { + pub(crate) fn file_decryptor(&self) -> Option<&FileDecryptor> { self.file_decryptor.as_ref() } @@ -625,7 +626,7 @@ impl RowGroupMetaData { /// Method to convert from encrypted Thrift. 
#[cfg(feature = "encryption")] - pub fn from_encrypted_thrift( + fn from_encrypted_thrift( schema_descr: SchemaDescPtr, mut rg: RowGroup, decryptor: Option<&FileDecryptor>, diff --git a/parquet/src/file/metadata/reader.rs b/parquet/src/file/metadata/reader.rs index 23720f25b2bf..b5b1fe99ff0d 100644 --- a/parquet/src/file/metadata/reader.rs +++ b/parquet/src/file/metadata/reader.rs @@ -414,11 +414,6 @@ impl ParquetMetaDataReader { mut fetch: F, file_size: usize, ) -> Result<()> { - #[cfg(feature = "encryption")] - let (metadata, remainder) = - Self::load_metadata(&mut fetch, file_size, self.get_prefetch_size()).await?; - - #[cfg(not(feature = "encryption"))] let (metadata, remainder) = Self::load_metadata(&mut fetch, file_size, self.get_prefetch_size()).await?; diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index e76197503145..2888eb11c34c 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -581,7 +581,10 @@ impl SerializedPageReader { /// Adds cryptographical information to the reader. 
#[cfg(feature = "encryption")] - pub fn with_crypto_context(mut self, crypto_context: Option>) -> Self { + pub(crate) fn with_crypto_context( + mut self, + crypto_context: Option>, + ) -> Self { self.crypto_context = crypto_context; self } From c4737a4939dbcfd31252fbf747cebb42ffb455ca Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Mon, 10 Mar 2025 15:49:35 +0100 Subject: [PATCH 86/97] Add new CI check --- .github/workflows/parquet.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/parquet.yml b/.github/workflows/parquet.yml index 4c46fde198bd..96c7ab8f4e3a 100644 --- a/.github/workflows/parquet.yml +++ b/.github/workflows/parquet.yml @@ -111,6 +111,8 @@ jobs: run: cargo check -p parquet --all-targets --all-features - name: Check compilation --all-targets --no-default-features --features json run: cargo check -p parquet --all-targets --no-default-features --features json + - name: Check compilation --no-default-features --features encryption --features async + run: cargo check -p parquet --no-default-features --features encryption --features async # test the parquet crate builds against wasm32 in stable rust wasm32-build: From 99e2b6d61951c78da90b152f014581718fc9979d Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Mon, 10 Mar 2025 15:53:31 +0100 Subject: [PATCH 87/97] Rename decryption module to decrypt. This is because we'll introduce encryption module later and we'll have to name it encrypt to not clash with the super module name (encryption). It would be odd to have sub modules called encrypt and decryption. 
--- parquet/src/arrow/arrow_reader/mod.rs | 4 ++-- parquet/src/arrow/async_reader/mod.rs | 2 +- parquet/src/arrow/async_reader/store.rs | 2 +- parquet/src/encryption/{decryption.rs => decrypt.rs} | 0 parquet/src/encryption/mod.rs | 2 +- parquet/src/file/metadata/mod.rs | 2 +- parquet/src/file/metadata/reader.rs | 2 +- parquet/src/file/serialized_reader.rs | 2 +- parquet/src/util/test_common/encryption_util.rs | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) rename parquet/src/encryption/{decryption.rs => decrypt.rs} (100%) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 8bfc38f7c876..9eb03640c8b3 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -33,7 +33,7 @@ use crate::arrow::schema::{parquet_to_arrow_schema_and_fields, ParquetField}; use crate::arrow::{parquet_to_arrow_field_levels, FieldLevels, ProjectionMask}; use crate::column::page::{PageIterator, PageReader}; #[cfg(feature = "encryption")] -use crate::encryption::decryption::{CryptoContext, FileDecryptionProperties}; +use crate::encryption::decrypt::{CryptoContext, FileDecryptionProperties}; use crate::errors::{ParquetError, Result}; use crate::file::metadata::{ParquetMetaData, ParquetMetaDataReader}; use crate::file::reader::{ChunkReader, SerializedPageReader}; @@ -1019,7 +1019,7 @@ mod tests { FloatType, Int32Type, Int64Type, Int96Type, }; #[cfg(feature = "encryption")] - use crate::encryption::decryption::FileDecryptionProperties; + use crate::encryption::decrypt::FileDecryptionProperties; use crate::errors::Result; use crate::file::properties::{EnabledStatistics, WriterProperties, WriterVersion}; use crate::file::writer::SerializedFileWriter; diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index 3a4be3789c36..9f75fabc3088 100644 --- a/parquet/src/arrow/async_reader/mod.rs +++ b/parquet/src/arrow/async_reader/mod.rs @@ -62,7 +62,7 @@ mod metadata; pub use 
metadata::*; #[cfg(feature = "encryption")] -use crate::encryption::decryption::{CryptoContext, FileDecryptionProperties}; +use crate::encryption::decrypt::{CryptoContext, FileDecryptionProperties}; #[cfg(feature = "object_store")] mod store; diff --git a/parquet/src/arrow/async_reader/store.rs b/parquet/src/arrow/async_reader/store.rs index 3fe8d2d655ef..a2c730e786af 100644 --- a/parquet/src/arrow/async_reader/store.rs +++ b/parquet/src/arrow/async_reader/store.rs @@ -24,7 +24,7 @@ use tokio::runtime::Handle; use crate::arrow::async_reader::AsyncFileReader; #[cfg(feature = "encryption")] -use crate::encryption::decryption::FileDecryptionProperties; +use crate::encryption::decrypt::FileDecryptionProperties; use crate::errors::{ParquetError, Result}; use crate::file::metadata::{ParquetMetaData, ParquetMetaDataReader}; diff --git a/parquet/src/encryption/decryption.rs b/parquet/src/encryption/decrypt.rs similarity index 100% rename from parquet/src/encryption/decryption.rs rename to parquet/src/encryption/decrypt.rs diff --git a/parquet/src/encryption/mod.rs b/parquet/src/encryption/mod.rs index 362906f50134..6e9168edb9f2 100644 --- a/parquet/src/encryption/mod.rs +++ b/parquet/src/encryption/mod.rs @@ -19,5 +19,5 @@ //! in the [spec](https://github.com/apache/parquet-format/blob/master/Encryption.md). 
pub(crate) mod ciphers; -pub(crate) mod decryption; +pub(crate) mod decrypt; pub(crate) mod modules; diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index 76ba935a91fa..217685049ea9 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -98,7 +98,7 @@ mod writer; use crate::basic::{ColumnOrder, Compression, Encoding, Type}; #[cfg(feature = "encryption")] use crate::encryption::{ - decryption::FileDecryptor, + decrypt::FileDecryptor, modules::{create_module_aad, ModuleType}, }; use crate::errors::{ParquetError, Result}; diff --git a/parquet/src/file/metadata/reader.rs b/parquet/src/file/metadata/reader.rs index b5b1fe99ff0d..1899db0deb6e 100644 --- a/parquet/src/file/metadata/reader.rs +++ b/parquet/src/file/metadata/reader.rs @@ -22,7 +22,7 @@ use bytes::Bytes; use crate::basic::ColumnOrder; #[cfg(feature = "encryption")] use crate::encryption::{ - decryption::{FileDecryptionProperties, FileDecryptor}, + decrypt::{FileDecryptionProperties, FileDecryptor}, modules::create_footer_aad, }; diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index 2888eb11c34c..2673f4ac52a7 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -23,7 +23,7 @@ use crate::bloom_filter::Sbbf; use crate::column::page::{Page, PageMetadata, PageReader}; use crate::compression::{create_codec, Codec}; #[cfg(feature = "encryption")] -use crate::encryption::decryption::{read_and_decrypt, CryptoContext}; +use crate::encryption::decrypt::{read_and_decrypt, CryptoContext}; use crate::errors::{ParquetError, Result}; use crate::file::page_index::offset_index::OffsetIndexMetaData; use crate::file::{ diff --git a/parquet/src/util/test_common/encryption_util.rs b/parquet/src/util/test_common/encryption_util.rs index 51d6f701db80..2f6e5bc45636 100644 --- a/parquet/src/util/test_common/encryption_util.rs +++ b/parquet/src/util/test_common/encryption_util.rs @@ 
-19,7 +19,7 @@ use crate::arrow::arrow_reader::{ ArrowReaderMetadata, ArrowReaderOptions, ParquetRecordBatchReaderBuilder, }; use crate::arrow::ParquetRecordBatchStreamBuilder; -use crate::encryption::decryption::FileDecryptionProperties; +use crate::encryption::decrypt::FileDecryptionProperties; use crate::errors::ParquetError; use crate::file::metadata::FileMetaData; use arrow_array::cast::AsArray; From d9e597c3f060bd151709b3ed370f3b31f543f93b Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Mon, 10 Mar 2025 17:18:45 +0100 Subject: [PATCH 88/97] Fix failing where encryption is enabled but no decryption properties are provided or where encryption is disabled but file is encrypted. --- parquet/src/arrow/arrow_reader/mod.rs | 32 +++++++++++++ parquet/src/arrow/async_reader/mod.rs | 64 +++++++++++++++++++------ parquet/src/arrow/async_reader/store.rs | 2 +- parquet/src/file/metadata/reader.rs | 17 ++++--- 4 files changed, 91 insertions(+), 24 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 9eb03640c8b3..15b60ef98431 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -2027,6 +2027,38 @@ mod tests { verify_encryption_test_file_read(file, decryption_properties); } + #[test] + #[cfg(not(feature = "encryption"))] + fn test_decrypting_without_encryption_flag_fails() { + let testdata = arrow::util::test_util::parquet_test_data(); + let path = format!("{testdata}/uniform_encryption.parquet.encrypted"); + let file = File::open(path).unwrap(); + + let options = ArrowReaderOptions::default(); + let result = ArrowReaderMetadata::load(&file, options.clone()); + assert!(result.is_err()); + assert_eq!( + result.unwrap_err().to_string(), + "Parquet error: Parquet file has an encrypted footer but the encryption feature is disabled" + ); + } + + #[test] + #[cfg(feature = "encryption")] + fn test_decrypting_without_decryption_properties_fails() { + let testdata = 
arrow::util::test_util::parquet_test_data(); + let path = format!("{testdata}/uniform_encryption.parquet.encrypted"); + let file = File::open(path).unwrap(); + + let options = ArrowReaderOptions::default(); + let result = ArrowReaderMetadata::load(&file, options.clone()); + assert!(result.is_err()); + assert_eq!( + result.unwrap_err().to_string(), + "Parquet error: Parquet file has an encrypted footer but no decryption properties were provided" + ); + } + #[test] #[cfg(feature = "encryption")] fn test_aes_ctr_encryption() { diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index 9f75fabc3088..808fa0136f56 100644 --- a/parquet/src/arrow/async_reader/mod.rs +++ b/parquet/src/arrow/async_reader/mod.rs @@ -111,7 +111,7 @@ pub trait AsyncFileReader: Send { /// Provides asynchronous access to the [`ParquetMetaData`] of encrypted parquet /// files, like get_metadata does for unencrypted ones. #[cfg(feature = "encryption")] - fn get_encrypted_metadata( + fn get_metadata_with_encryption( &mut self, file_decryption_properties: Option, ) -> BoxFuture<'_, Result>>; @@ -132,12 +132,12 @@ impl AsyncFileReader for Box { } #[cfg(feature = "encryption")] - fn get_encrypted_metadata( + fn get_metadata_with_encryption( &mut self, file_decryption_properties: Option, ) -> BoxFuture<'_, Result>> { self.as_mut() - .get_encrypted_metadata(file_decryption_properties) + .get_metadata_with_encryption(file_decryption_properties) } } @@ -159,7 +159,7 @@ impl AsyncFileReader for T { } #[cfg(feature = "encryption")] - fn get_encrypted_metadata( + fn get_metadata_with_encryption( &mut self, file_decryption_properties: Option, ) -> BoxFuture<'_, Result>> { @@ -178,7 +178,7 @@ impl AsyncFileReader for T { let mut buf = Vec::with_capacity(metadata_len); self.take(metadata_len as _).read_to_end(&mut buf).await?; - let parquet_metadata_reader = ParquetMetaDataReader::decrypt_metadata( + let parquet_metadata_reader = 
ParquetMetaDataReader::decode_metadata_with_encryption( &buf, footer.is_encrypted_footer(), file_decryption_properties.as_ref(), @@ -198,6 +198,13 @@ impl AsyncFileReader for T { let footer = ParquetMetaDataReader::decode_footer_tail(&buf)?; let metadata_len = footer.metadata_length(); + #[cfg(not(feature = "encryption"))] + if footer.encrypted_footer { + return Err(general_err!( + "Parquet file has an encrypted footer but the encryption feature is disabled" + )); + } + self.seek(SeekFrom::End(-FOOTER_SIZE_I64 - metadata_len as i64)) .await?; @@ -228,13 +235,10 @@ impl ArrowReaderMetadata { // TODO: this is all rather awkward. It would be nice if AsyncFileReader::get_metadata // took an argument to fetch the page indexes. #[cfg(feature = "encryption")] - let mut metadata = if options.file_decryption_properties.is_some() { - input - .get_encrypted_metadata(options.file_decryption_properties.clone()) - .await? - } else { - input.get_metadata().await? - }; + let mut metadata = input + .get_metadata_with_encryption(options.file_decryption_properties.clone()) + .await?; + #[cfg(not(feature = "encryption"))] let mut metadata = input.get_metadata().await?; @@ -1210,11 +1214,11 @@ mod tests { } #[cfg(feature = "encryption")] - fn get_encrypted_metadata( + fn get_metadata_with_encryption( &mut self, _file_decryption_properties: Option, ) -> BoxFuture<'_, Result>> { - todo!("we don't test for decryption yet"); + futures::future::ready(Ok(self.metadata.clone())).boxed() } } @@ -2704,4 +2708,36 @@ mod tests { } }; } + + #[tokio::test] + #[cfg(not(feature = "encryption"))] + async fn test_decrypting_without_encryption_flag_fails() { + let testdata = arrow::util::test_util::parquet_test_data(); + let path = format!("{testdata}/uniform_encryption.parquet.encrypted"); + let mut file = File::open(&path).await.unwrap(); + + let options = ArrowReaderOptions::new(); + let result = ArrowReaderMetadata::load_async(&mut file, options).await; + assert!(result.is_err()); + assert_eq!( + 
result.unwrap_err().to_string(), + "Parquet error: Parquet file has an encrypted footer but the encryption feature is disabled" + ); + } + + #[tokio::test] + #[cfg(feature = "encryption")] + async fn test_decrypting_without_decryption_properties_fails() { + let testdata = arrow::util::test_util::parquet_test_data(); + let path = format!("{testdata}/uniform_encryption.parquet.encrypted"); + let mut file = File::open(&path).await.unwrap(); + + let options = ArrowReaderOptions::new(); + let result = ArrowReaderMetadata::load_async(&mut file, options).await; + assert!(result.is_err()); + assert_eq!( + result.unwrap_err().to_string(), + "Parquet error: Parquet file has an encrypted footer but no decryption properties were provided" + ); + } } diff --git a/parquet/src/arrow/async_reader/store.rs b/parquet/src/arrow/async_reader/store.rs index a2c730e786af..0d9e9b9a633b 100644 --- a/parquet/src/arrow/async_reader/store.rs +++ b/parquet/src/arrow/async_reader/store.rs @@ -179,7 +179,7 @@ impl AsyncFileReader for ParquetObjectReader { } #[cfg(feature = "encryption")] - fn get_encrypted_metadata( + fn get_metadata_with_encryption( &mut self, _file_decryption_properties: Option, ) -> BoxFuture<'_, Result>> { diff --git a/parquet/src/file/metadata/reader.rs b/parquet/src/file/metadata/reader.rs index 1899db0deb6e..ad45a6fe8245 100644 --- a/parquet/src/file/metadata/reader.rs +++ b/parquet/src/file/metadata/reader.rs @@ -85,7 +85,7 @@ pub struct ParquetMetaDataReader { /// This is parsed from the last 8 bytes of the Parquet file pub struct FooterTail { metadata_length: usize, - encrypted_footer: bool, + pub(crate) encrypted_footer: bool, } impl FooterTail { @@ -569,13 +569,12 @@ impl ParquetMetaDataReader { let start = file_size - footer_metadata_len as u64; #[cfg(feature = "encryption")] - if self.file_decryption_properties.is_some() { - return Self::decrypt_metadata( - chunk_reader.get_bytes(start, metadata_len)?.as_ref(), - footer.is_encrypted_footer(), - 
self.file_decryption_properties.as_ref(), - ); - } + return Self::decode_metadata_with_encryption( + chunk_reader.get_bytes(start, metadata_len)?.as_ref(), + footer.is_encrypted_footer(), + self.file_decryption_properties.as_ref(), + ); + #[cfg(not(feature = "encryption"))] Self::decode_metadata(chunk_reader.get_bytes(start, metadata_len)?.as_ref()) } @@ -692,7 +691,7 @@ impl ParquetMetaDataReader { /// [Parquet Spec]: https://github.com/apache/parquet-format#metadata /// [Parquet Encryption Spec]: https://parquet.apache.org/docs/file-format/data-pages/encryption/ #[cfg(feature = "encryption")] - pub(crate) fn decrypt_metadata( + pub(crate) fn decode_metadata_with_encryption( buf: &[u8], encrypted_footer: bool, file_decryption_properties: Option<&FileDecryptionProperties>, From cc27421806316b3151497c7266269abcd21f88f9 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 11 Mar 2025 01:13:14 +0100 Subject: [PATCH 89/97] Apply suggestions from code review Co-authored-by: Adam Reeve --- parquet/src/arrow/arrow_reader/mod.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 15b60ef98431..a6e98d44e840 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -1889,7 +1889,7 @@ mod tests { let column_1_key = "1234567890123450".as_bytes(); let column_2_key = "1234567890123451".as_bytes(); - // Provided AAD prefix overrides the one stored in the file + // Can read successfully when providing the correct AAD prefix let decryption_properties = FileDecryptionProperties::builder(footer_key.to_vec()) .with_column_key("double_field", column_1_key.to_vec()) .with_column_key("float_field", column_2_key.to_vec()) @@ -1917,7 +1917,7 @@ mod tests { "Parquet error: Provided footer key and AAD were unable to decrypt parquet footer" ); - // Using wrong AAD prefix stored in the file should fail + // Not providing any AAD prefix should fail as it 
isn't stored in the file let decryption_properties = FileDecryptionProperties::builder(footer_key.to_vec()) .with_column_key("double_field", column_1_key.to_vec()) .with_column_key("float_field", column_2_key.to_vec()) From 90434d67e4cda052da8b3695da3b539e510e6e3c Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 11 Mar 2025 00:50:34 +0100 Subject: [PATCH 90/97] get_metadata_with_encryption -> get_metadata_with_options --- parquet/src/arrow/async_reader/mod.rs | 53 ++++++++++---------- parquet/src/arrow/async_reader/store.rs | 66 +++++++++++++++++++------ 2 files changed, 79 insertions(+), 40 deletions(-) diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index 808fa0136f56..867bfd83eadc 100644 --- a/parquet/src/arrow/async_reader/mod.rs +++ b/parquet/src/arrow/async_reader/mod.rs @@ -108,13 +108,15 @@ pub trait AsyncFileReader: Send { /// for caching, pre-fetching, catalog metadata, etc... fn get_metadata(&mut self) -> BoxFuture<'_, Result>>; - /// Provides asynchronous access to the [`ParquetMetaData`] of encrypted parquet - /// files, like get_metadata does for unencrypted ones. - #[cfg(feature = "encryption")] - fn get_metadata_with_encryption( - &mut self, - file_decryption_properties: Option, - ) -> BoxFuture<'_, Result>>; + /// Provides asynchronous access to the [`ParquetMetaData`] of a parquet file, + /// allowing fine-grained control over how metadata is sourced, in particular allowing + /// for caching, pre-fetching, catalog metadata, decrypting, etc... 
+ /// + /// By default calls `get_metadata()` + fn get_metadata_with_options<'a>( + &'a mut self, + options: &'a ArrowReaderOptions, + ) -> BoxFuture<'a, Result>>; } /// This allows Box to be used as an AsyncFileReader, @@ -131,13 +133,14 @@ impl AsyncFileReader for Box { self.as_mut().get_metadata() } - #[cfg(feature = "encryption")] - fn get_metadata_with_encryption( - &mut self, - file_decryption_properties: Option, - ) -> BoxFuture<'_, Result>> { - self.as_mut() - .get_metadata_with_encryption(file_decryption_properties) + fn get_metadata_with_options<'a>( + &'a mut self, + options: &'a ArrowReaderOptions, + ) -> BoxFuture<'a, Result>> { + #[cfg(feature = "encryption")] + return self.as_mut().get_metadata_with_options(options); + #[cfg(not(feature = "encryption"))] + self.as_mut().get_metadata() } } @@ -159,10 +162,10 @@ impl AsyncFileReader for T { } #[cfg(feature = "encryption")] - fn get_metadata_with_encryption( - &mut self, - file_decryption_properties: Option, - ) -> BoxFuture<'_, Result>> { + fn get_metadata_with_options<'a>( + &'a mut self, + options: &'a ArrowReaderOptions, + ) -> BoxFuture<'a, Result>> { const FOOTER_SIZE_I64: i64 = FOOTER_SIZE as i64; async move { self.seek(SeekFrom::End(-FOOTER_SIZE_I64)).await?; @@ -181,7 +184,7 @@ impl AsyncFileReader for T { let parquet_metadata_reader = ParquetMetaDataReader::decode_metadata_with_encryption( &buf, footer.is_encrypted_footer(), - file_decryption_properties.as_ref(), + options.file_decryption_properties.as_ref(), )?; Ok(Arc::new(parquet_metadata_reader)) } @@ -235,9 +238,7 @@ impl ArrowReaderMetadata { // TODO: this is all rather awkward. It would be nice if AsyncFileReader::get_metadata // took an argument to fetch the page indexes. 
#[cfg(feature = "encryption")] - let mut metadata = input - .get_metadata_with_encryption(options.file_decryption_properties.clone()) - .await?; + let mut metadata = input.get_metadata_with_options(&options).await?; #[cfg(not(feature = "encryption"))] let mut metadata = input.get_metadata().await?; @@ -1214,10 +1215,10 @@ mod tests { } #[cfg(feature = "encryption")] - fn get_metadata_with_encryption( - &mut self, - _file_decryption_properties: Option, - ) -> BoxFuture<'_, Result>> { + fn get_metadata_with_options<'a>( + &'a mut self, + options: &'a ArrowReaderOptions, + ) -> BoxFuture<'a, Result>> { futures::future::ready(Ok(self.metadata.clone())).boxed() } } diff --git a/parquet/src/arrow/async_reader/store.rs b/parquet/src/arrow/async_reader/store.rs index 0d9e9b9a633b..d47629d1c15f 100644 --- a/parquet/src/arrow/async_reader/store.rs +++ b/parquet/src/arrow/async_reader/store.rs @@ -17,17 +17,15 @@ use std::{ops::Range, sync::Arc}; +use crate::arrow::arrow_reader::ArrowReaderOptions; +use crate::arrow::async_reader::AsyncFileReader; +use crate::errors::{ParquetError, Result}; +use crate::file::metadata::{ParquetMetaData, ParquetMetaDataReader}; use bytes::Bytes; use futures::{future::BoxFuture, FutureExt, TryFutureExt}; use object_store::{path::Path, ObjectMeta, ObjectStore}; use tokio::runtime::Handle; -use crate::arrow::async_reader::AsyncFileReader; -#[cfg(feature = "encryption")] -use crate::encryption::decrypt::FileDecryptionProperties; -use crate::errors::{ParquetError, Result}; -use crate::file::metadata::{ParquetMetaData, ParquetMetaDataReader}; - /// Reads Parquet files in object storage using [`ObjectStore`]. 
/// /// ```no_run @@ -179,16 +177,17 @@ impl AsyncFileReader for ParquetObjectReader { } #[cfg(feature = "encryption")] - fn get_metadata_with_encryption( - &mut self, - _file_decryption_properties: Option, - ) -> BoxFuture<'_, Result>> { + fn get_metadata_with_options<'a>( + &'a mut self, + options: &'a ArrowReaderOptions, + ) -> BoxFuture<'a, Result>> { Box::pin(async move { let file_size = self.meta.size; let metadata = ParquetMetaDataReader::new() .with_column_indexes(self.preload_column_index) .with_offset_indexes(self.preload_offset_index) .with_prefetch_hint(self.metadata_size_hint) + .with_decryption_properties(options.file_decryption_properties.as_ref()) .load_and_finish(self, file_size) .await?; @@ -206,16 +205,17 @@ mod tests { use futures::TryStreamExt; + use crate::arrow::arrow_reader::ArrowReaderOptions; + use crate::arrow::async_reader::{AsyncFileReader, ParquetObjectReader}; + use crate::arrow::ParquetRecordBatchStreamBuilder; + use crate::encryption::decrypt::FileDecryptionProperties; + use crate::errors::ParquetError; use arrow::util::test_util::parquet_test_data; use futures::FutureExt; use object_store::local::LocalFileSystem; use object_store::path::Path; use object_store::{ObjectMeta, ObjectStore}; - use crate::arrow::async_reader::{AsyncFileReader, ParquetObjectReader}; - use crate::arrow::ParquetRecordBatchStreamBuilder; - use crate::errors::ParquetError; - async fn get_meta_store() -> (ObjectMeta, Arc) { let res = parquet_test_data(); let store = LocalFileSystem::new_with_prefix(res).unwrap(); @@ -228,6 +228,44 @@ mod tests { (meta, Arc::new(store) as Arc) } + #[cfg(feature = "encryption")] + async fn get_encrypted_meta_store() -> (ObjectMeta, Arc) { + let res = parquet_test_data(); + let store = LocalFileSystem::new_with_prefix(res).unwrap(); + + let meta = store + .head(&Path::from("uniform_encryption.parquet.encrypted")) + .await + .unwrap(); + + (meta, Arc::new(store) as Arc) + } + + #[tokio::test] + #[cfg(feature = "encryption")] + 
async fn test_encrypted() { + let (meta, store) = get_encrypted_meta_store().await; + + let key_code: &[u8] = "0123456789012345".as_bytes(); + let decryption_properties = FileDecryptionProperties::builder(key_code.to_vec()) + .build() + .unwrap(); + let options = + ArrowReaderOptions::new().with_file_decryption_properties(decryption_properties); + let mut binding = ParquetObjectReader::new(store, meta); + let binding = binding.get_metadata_with_options(&options); + + let object_reader = binding.await.unwrap(); + // todo: this should pass + // let builder = ParquetRecordBatchStreamBuilder::new_with_options(object_reader, options) + // .await + // .unwrap(); + // let batches: Vec<_> = builder.build().unwrap().try_collect().await.unwrap(); + // + // assert_eq!(batches.len(), 1); + // assert_eq!(batches[0].num_rows(), 8); + } + #[tokio::test] async fn test_simple() { let (meta, store) = get_meta_store().await; From 5273244be60b84b31c56530be42b9fedf901cca4 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 11 Mar 2025 01:10:59 +0100 Subject: [PATCH 91/97] Use ParquetMetaData instead of RowGroupMetaData in InMemoryRowGroup. Change row_group_ordinal to row_group_idx. 
--- parquet/src/arrow/async_reader/mod.rs | 32 +++++++++++++-------------- parquet/src/encryption/decrypt.rs | 14 ++++++------ parquet/src/encryption/modules.rs | 10 ++++----- 3 files changed, 27 insertions(+), 29 deletions(-) diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index 867bfd83eadc..687a7f6e4d8f 100644 --- a/parquet/src/arrow/async_reader/mod.rs +++ b/parquet/src/arrow/async_reader/mod.rs @@ -52,7 +52,7 @@ use crate::bloom_filter::{ }; use crate::column::page::{PageIterator, PageReader}; use crate::errors::{ParquetError, Result}; -use crate::file::metadata::{ParquetMetaData, ParquetMetaDataReader, RowGroupMetaData}; +use crate::file::metadata::{ParquetMetaData, ParquetMetaDataReader}; use crate::file::page_index::offset_index::OffsetIndexMetaData; use crate::file::reader::{ChunkReader, Length, SerializedPageReader}; use crate::file::FOOTER_SIZE; @@ -636,15 +636,13 @@ where .map(|x| x[row_group_idx].as_slice()); let mut row_group = InMemoryRowGroup { - metadata: meta, // schema: meta.schema_descr_ptr(), row_count: meta.num_rows() as usize, column_chunks: vec![None; meta.columns().len()], offset_index, #[cfg(feature = "encryption")] - row_group_ordinal: row_group_idx, - #[cfg(feature = "encryption")] - parquet_metadata: self.metadata.clone(), + row_group_idx, + metadata: self.metadata.as_ref(), }; if let Some(filter) = self.filter.as_mut() { @@ -926,14 +924,12 @@ where /// An in-memory collection of column chunks struct InMemoryRowGroup<'a> { - metadata: &'a RowGroupMetaData, offset_index: Option<&'a [OffsetIndexMetaData]>, column_chunks: Vec>>, row_count: usize, #[cfg(feature = "encryption")] - row_group_ordinal: usize, - #[cfg(feature = "encryption")] - parquet_metadata: Arc, + row_group_idx: usize, + metadata: &'a ParquetMetaData, } impl InMemoryRowGroup<'_> { @@ -944,6 +940,7 @@ impl InMemoryRowGroup<'_> { projection: &ProjectionMask, selection: Option<&RowSelection>, ) -> Result<()> { + let metadata = 
self.metadata.row_group(self.row_group_idx); if let Some((selection, offset_index)) = selection.zip(self.offset_index) { // If we have a `RowSelection` and an `OffsetIndex` then only fetch pages required for the // `RowSelection` @@ -952,7 +949,7 @@ impl InMemoryRowGroup<'_> { let fetch_ranges = self .column_chunks .iter() - .zip(self.metadata.columns()) + .zip(metadata.columns()) .enumerate() .filter(|&(idx, (chunk, _chunk_meta))| { chunk.is_none() && projection.leaf_included(idx) @@ -991,7 +988,7 @@ impl InMemoryRowGroup<'_> { } *chunk = Some(Arc::new(ColumnChunkData::Sparse { - length: self.metadata.column(idx).byte_range().1 as usize, + length: metadata.column(idx).byte_range().1 as usize, data: offsets.into_iter().zip(chunks.into_iter()).collect(), })) } @@ -1003,7 +1000,7 @@ impl InMemoryRowGroup<'_> { .enumerate() .filter(|&(idx, chunk)| chunk.is_none() && projection.leaf_included(idx)) .map(|(idx, _chunk)| { - let column = self.metadata.column(idx); + let column = metadata.column(idx); let (start, length) = column.byte_range(); start as usize..(start + length) as usize }) @@ -1018,7 +1015,7 @@ impl InMemoryRowGroup<'_> { if let Some(data) = chunk_data.next() { *chunk = Some(Arc::new(ColumnChunkData::Dense { - offset: self.metadata.column(idx).byte_range().0 as usize, + offset: metadata.column(idx).byte_range().0 as usize, data, })); } @@ -1037,9 +1034,9 @@ impl RowGroups for InMemoryRowGroup<'_> { fn column_chunks(&self, i: usize) -> Result> { #[cfg(feature = "encryption")] let crypto_context = - if let Some(file_decryptor) = &self.parquet_metadata.clone().file_decryptor().clone() { + if let Some(file_decryptor) = self.metadata.clone().file_decryptor().clone() { let column_name = &self - .parquet_metadata + .metadata .clone() .file_metadata() .schema_descr() @@ -1052,7 +1049,7 @@ impl RowGroups for InMemoryRowGroup<'_> { file_decryptor.get_column_metadata_decryptor(column_name.name())?; let crypto_context = CryptoContext::new( - self.row_group_ordinal, + 
self.row_group_idx, i, data_decryptor, metadata_decryptor, @@ -1076,9 +1073,10 @@ impl RowGroups for InMemoryRowGroup<'_> { // filter out empty offset indexes (old versions specified Some(vec![]) when no present) .filter(|index| !index.is_empty()) .map(|index| index[i].page_locations.clone()); + let metadata = self.metadata.row_group(self.row_group_idx); let page_reader = SerializedPageReader::new( data.clone(), - self.metadata.column(i), + metadata.column(i), self.row_count, page_locations, )?; diff --git a/parquet/src/encryption/decrypt.rs b/parquet/src/encryption/decrypt.rs index 9c8435fef8b6..dd9bf0b31502 100644 --- a/parquet/src/encryption/decrypt.rs +++ b/parquet/src/encryption/decrypt.rs @@ -40,7 +40,7 @@ pub fn read_and_decrypt( // decrypt parquet modules (data pages, dictionary pages, etc.). #[derive(Debug, Clone)] pub(crate) struct CryptoContext { - pub(crate) row_group_ordinal: usize, + pub(crate) row_group_idx: usize, pub(crate) column_ordinal: usize, pub(crate) page_ordinal: Option, pub(crate) dictionary_page: bool, @@ -54,14 +54,14 @@ pub(crate) struct CryptoContext { impl CryptoContext { pub(crate) fn new( - row_group_ordinal: usize, + row_group_idx: usize, column_ordinal: usize, data_decryptor: Arc, metadata_decryptor: Arc, file_aad: Vec, ) -> Self { Self { - row_group_ordinal, + row_group_idx, column_ordinal, page_ordinal: None, dictionary_page: false, @@ -73,7 +73,7 @@ impl CryptoContext { pub(crate) fn with_page_ordinal(&self, page_ordinal: usize) -> Self { Self { - row_group_ordinal: self.row_group_ordinal, + row_group_idx: self.row_group_idx, column_ordinal: self.column_ordinal, page_ordinal: Some(page_ordinal), dictionary_page: false, @@ -93,7 +93,7 @@ impl CryptoContext { create_module_aad( self.file_aad(), module_type, - self.row_group_ordinal, + self.row_group_idx, self.column_ordinal, self.page_ordinal, ) @@ -109,7 +109,7 @@ impl CryptoContext { create_module_aad( self.file_aad(), module_type, - self.row_group_ordinal, + 
self.row_group_idx, self.column_ordinal, self.page_ordinal, ) @@ -117,7 +117,7 @@ impl CryptoContext { pub(crate) fn for_dictionary_page(&self) -> Self { Self { - row_group_ordinal: self.row_group_ordinal, + row_group_idx: self.row_group_idx, column_ordinal: self.column_ordinal, page_ordinal: self.page_ordinal, dictionary_page: true, diff --git a/parquet/src/encryption/modules.rs b/parquet/src/encryption/modules.rs index 0f2f8083f5ed..6bf9306b256d 100644 --- a/parquet/src/encryption/modules.rs +++ b/parquet/src/encryption/modules.rs @@ -34,7 +34,7 @@ pub fn create_footer_aad(file_aad: &[u8]) -> crate::errors::Result> { pub(crate) fn create_module_aad( file_aad: &[u8], module_type: ModuleType, - row_group_ordinal: usize, + row_group_idx: usize, column_ordinal: usize, page_ordinal: Option, ) -> crate::errors::Result> { @@ -47,11 +47,11 @@ pub(crate) fn create_module_aad( return Ok(aad); } - if row_group_ordinal > i16::MAX as usize { + if row_group_idx > i16::MAX as usize { return Err(general_err!( "Encrypted parquet files can't have more than {} row groups: {}", i16::MAX, - row_group_ordinal + row_group_idx )); } if column_ordinal > i16::MAX as usize { @@ -68,7 +68,7 @@ pub(crate) fn create_module_aad( let mut aad = Vec::with_capacity(file_aad.len() + 5); aad.extend_from_slice(file_aad); aad.extend_from_slice(module_buf.as_ref()); - aad.extend_from_slice((row_group_ordinal as i16).to_le_bytes().as_ref()); + aad.extend_from_slice((row_group_idx as i16).to_le_bytes().as_ref()); aad.extend_from_slice((column_ordinal as i16).to_le_bytes().as_ref()); return Ok(aad); } @@ -87,7 +87,7 @@ pub(crate) fn create_module_aad( let mut aad = Vec::with_capacity(file_aad.len() + 7); aad.extend_from_slice(file_aad); aad.extend_from_slice(module_buf.as_ref()); - aad.extend_from_slice((row_group_ordinal as i16).to_le_bytes().as_ref()); + aad.extend_from_slice((row_group_idx as i16).to_le_bytes().as_ref()); aad.extend_from_slice((column_ordinal as i16).to_le_bytes().as_ref()); 
aad.extend_from_slice((page_ordinal as i16).to_le_bytes().as_ref()); Ok(aad) From d8fd4f2e8aa1c72b90afd55f77db893adc0b34f8 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 11 Mar 2025 01:17:22 +0100 Subject: [PATCH 92/97] Review feedback --- parquet/src/arrow/arrow_reader/mod.rs | 4 ++-- parquet/src/arrow/async_reader/store.rs | 3 ++- parquet/src/encryption/decrypt.rs | 4 ++-- parquet/src/file/metadata/reader.rs | 2 +- 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index a6e98d44e840..2ee69dcf1068 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -1893,7 +1893,7 @@ mod tests { let decryption_properties = FileDecryptionProperties::builder(footer_key.to_vec()) .with_column_key("double_field", column_1_key.to_vec()) .with_column_key("float_field", column_2_key.to_vec()) - .with_aad_prefix("tester") + .with_aad_prefix("tester".as_bytes().to_vec()) .build() .unwrap(); @@ -1903,7 +1903,7 @@ mod tests { let decryption_properties = FileDecryptionProperties::builder(footer_key.to_vec()) .with_column_key("double_field", column_1_key.to_vec()) .with_column_key("float_field", column_2_key.to_vec()) - .with_aad_prefix("wrong_aad_prefix") + .with_aad_prefix("wrong_aad_prefix".as_bytes().to_vec()) .build() .unwrap(); diff --git a/parquet/src/arrow/async_reader/store.rs b/parquet/src/arrow/async_reader/store.rs index d47629d1c15f..0060ed6525ed 100644 --- a/parquet/src/arrow/async_reader/store.rs +++ b/parquet/src/arrow/async_reader/store.rs @@ -255,8 +255,9 @@ mod tests { let mut binding = ParquetObjectReader::new(store, meta); let binding = binding.get_metadata_with_options(&options); - let object_reader = binding.await.unwrap(); // todo: this should pass + // let object_reader = binding.await.unwrap(); + // let builder = ParquetRecordBatchStreamBuilder::new_with_options(object_reader, options) // .await // .unwrap(); diff --git 
a/parquet/src/encryption/decrypt.rs b/parquet/src/encryption/decrypt.rs index dd9bf0b31502..d5bfe3cfc076 100644 --- a/parquet/src/encryption/decrypt.rs +++ b/parquet/src/encryption/decrypt.rs @@ -174,8 +174,8 @@ impl DecryptionPropertiesBuilder { }) } - pub fn with_aad_prefix(mut self, value: &str) -> Self { - self.aad_prefix = Some(value.as_bytes().to_vec()); + pub fn with_aad_prefix(mut self, value: Vec) -> Self { + self.aad_prefix = Some(value); self } diff --git a/parquet/src/file/metadata/reader.rs b/parquet/src/file/metadata/reader.rs index ad45a6fe8245..e0746ec0bcb9 100644 --- a/parquet/src/file/metadata/reader.rs +++ b/parquet/src/file/metadata/reader.rs @@ -85,7 +85,7 @@ pub struct ParquetMetaDataReader { /// This is parsed from the last 8 bytes of the Parquet file pub struct FooterTail { metadata_length: usize, - pub(crate) encrypted_footer: bool, + encrypted_footer: bool, } impl FooterTail { From 67b8cb9f78351ba9a72b6010ae55d35a508d317b Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 11 Mar 2025 01:32:58 +0100 Subject: [PATCH 93/97] Fixes --- parquet/src/arrow/async_reader/mod.rs | 58 ++++++++++++------------- parquet/src/arrow/async_reader/store.rs | 2 +- parquet/src/file/metadata/reader.rs | 2 +- 3 files changed, 29 insertions(+), 33 deletions(-) diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index 687a7f6e4d8f..a705dad5e686 100644 --- a/parquet/src/arrow/async_reader/mod.rs +++ b/parquet/src/arrow/async_reader/mod.rs @@ -161,7 +161,6 @@ impl AsyncFileReader for T { .boxed() } - #[cfg(feature = "encryption")] fn get_metadata_with_options<'a>( &'a mut self, options: &'a ArrowReaderOptions, @@ -202,7 +201,7 @@ impl AsyncFileReader for T { let footer = ParquetMetaDataReader::decode_footer_tail(&buf)?; let metadata_len = footer.metadata_length(); #[cfg(not(feature = "encryption"))] - if footer.encrypted_footer { + if footer.is_encrypted_footer() { return Err(general_err!( "Parquet file has an encrypted 
footer but the encryption feature is disabled" )); @@ -927,7 +926,6 @@ struct InMemoryRowGroup<'a> { offset_index: Option<&'a [OffsetIndexMetaData]>, column_chunks: Vec>>, row_count: usize, - #[cfg(feature = "encryption")] row_group_idx: usize, metadata: &'a ParquetMetaData, } @@ -1033,35 +1031,34 @@ impl RowGroups for InMemoryRowGroup<'_> { fn column_chunks(&self, i: usize) -> Result> { #[cfg(feature = "encryption")] - let crypto_context = - if let Some(file_decryptor) = self.metadata.clone().file_decryptor().clone() { - let column_name = &self - .metadata - .clone() - .file_metadata() - .schema_descr() - .column(i); - - if file_decryptor.is_column_encrypted(column_name.name()) { - let data_decryptor = - file_decryptor.get_column_data_decryptor(column_name.name())?; - let metadata_decryptor = - file_decryptor.get_column_metadata_decryptor(column_name.name())?; - - let crypto_context = CryptoContext::new( - self.row_group_idx, - i, - data_decryptor, - metadata_decryptor, - file_decryptor.file_aad().clone(), - ); - Some(Arc::new(crypto_context)) - } else { - None - } + let crypto_context = if let Some(file_decryptor) = self.metadata.clone().file_decryptor() { + let column_name = &self + .metadata + .clone() + .file_metadata() + .schema_descr() + .column(i); + + if file_decryptor.is_column_encrypted(column_name.name()) { + let data_decryptor = + file_decryptor.get_column_data_decryptor(column_name.name())?; + let metadata_decryptor = + file_decryptor.get_column_metadata_decryptor(column_name.name())?; + + let crypto_context = CryptoContext::new( + self.row_group_idx, + i, + data_decryptor, + metadata_decryptor, + file_decryptor.file_aad().clone(), + ); + Some(Arc::new(crypto_context)) } else { None - }; + } + } else { + None + }; match &self.column_chunks[i] { None => Err(ParquetError::General(format!( @@ -1212,7 +1209,6 @@ mod tests { futures::future::ready(Ok(self.metadata.clone())).boxed() } - #[cfg(feature = "encryption")] fn get_metadata_with_options<'a>( &'a 
mut self, options: &'a ArrowReaderOptions, diff --git a/parquet/src/arrow/async_reader/store.rs b/parquet/src/arrow/async_reader/store.rs index 0060ed6525ed..128316b51316 100644 --- a/parquet/src/arrow/async_reader/store.rs +++ b/parquet/src/arrow/async_reader/store.rs @@ -253,7 +253,7 @@ mod tests { let options = ArrowReaderOptions::new().with_file_decryption_properties(decryption_properties); let mut binding = ParquetObjectReader::new(store, meta); - let binding = binding.get_metadata_with_options(&options); + let _binding = binding.get_metadata_with_options(&options); // todo: this should pass // let object_reader = binding.await.unwrap(); diff --git a/parquet/src/file/metadata/reader.rs b/parquet/src/file/metadata/reader.rs index e0746ec0bcb9..63f9c8e3b5c9 100644 --- a/parquet/src/file/metadata/reader.rs +++ b/parquet/src/file/metadata/reader.rs @@ -561,7 +561,7 @@ impl ParquetMetaDataReader { return Err(ParquetError::NeedMoreData(footer_metadata_len)); } #[cfg(not(feature = "encryption"))] - if footer.encrypted_footer { + if footer.is_encrypted_footer() { return Err(general_err!( "Parquet file has an encrypted footer but the encryption feature is disabled" )); From 34919309d746abbfdcbf387b5dc4b286ecc6c2ea Mon Sep 17 00:00:00 2001 From: Adam Reeve Date: Tue, 11 Mar 2025 14:17:02 +1300 Subject: [PATCH 94/97] Continue refactor away from encryption specific APIs and fix async reader load --- parquet/src/arrow/async_reader/mod.rs | 35 ++++++------- parquet/src/arrow/async_reader/store.rs | 38 ++++++++------- parquet/src/file/metadata/reader.rs | 65 +++++++++++++++++-------- 3 files changed, 81 insertions(+), 57 deletions(-) diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index a705dad5e686..34c43a863ce9 100644 --- a/parquet/src/arrow/async_reader/mod.rs +++ b/parquet/src/arrow/async_reader/mod.rs @@ -62,7 +62,7 @@ mod metadata; pub use metadata::*; #[cfg(feature = "encryption")] -use 
crate::encryption::decrypt::{CryptoContext, FileDecryptionProperties}; +use crate::encryption::decrypt::CryptoContext; #[cfg(feature = "object_store")] mod store; @@ -137,10 +137,7 @@ impl AsyncFileReader for Box { &'a mut self, options: &'a ArrowReaderOptions, ) -> BoxFuture<'a, Result>> { - #[cfg(feature = "encryption")] - return self.as_mut().get_metadata_with_options(options); - #[cfg(not(feature = "encryption"))] - self.as_mut().get_metadata() + self.as_mut().get_metadata_with_options(options) } } @@ -180,12 +177,15 @@ impl AsyncFileReader for T { let mut buf = Vec::with_capacity(metadata_len); self.take(metadata_len as _).read_to_end(&mut buf).await?; - let parquet_metadata_reader = ParquetMetaDataReader::decode_metadata_with_encryption( - &buf, - footer.is_encrypted_footer(), - options.file_decryption_properties.as_ref(), - )?; - Ok(Arc::new(parquet_metadata_reader)) + let metadata_reader = ParquetMetaDataReader::new(); + + #[cfg(feature = "encryption")] + let metadata_reader = metadata_reader + .with_decryption_properties(options.file_decryption_properties.as_ref()); + + let parquet_metadata = metadata_reader.decode_footer_metadata(&buf, &footer)?; + + Ok(Arc::new(parquet_metadata)) } .boxed() } @@ -200,10 +200,10 @@ impl AsyncFileReader for T { let footer = ParquetMetaDataReader::decode_footer_tail(&buf)?; let metadata_len = footer.metadata_length(); - #[cfg(not(feature = "encryption"))] + if footer.is_encrypted_footer() { return Err(general_err!( - "Parquet file has an encrypted footer but the encryption feature is disabled" + "Parquet file has an encrypted footer but decryption properties were not provided" )); } @@ -236,12 +236,8 @@ impl ArrowReaderMetadata { ) -> Result { // TODO: this is all rather awkward. It would be nice if AsyncFileReader::get_metadata // took an argument to fetch the page indexes. 
- #[cfg(feature = "encryption")] let mut metadata = input.get_metadata_with_options(&options).await?; - #[cfg(not(feature = "encryption"))] - let mut metadata = input.get_metadata().await?; - if options.page_index && metadata.column_index().is_none() && metadata.offset_index().is_none() @@ -639,7 +635,6 @@ where row_count: meta.num_rows() as usize, column_chunks: vec![None; meta.columns().len()], offset_index, - #[cfg(feature = "encryption")] row_group_idx, metadata: self.metadata.as_ref(), }; @@ -1170,6 +1165,8 @@ mod tests { use crate::arrow::arrow_reader::{ArrowReaderMetadata, ArrowReaderOptions}; use crate::arrow::schema::parquet_to_arrow_schema_and_fields; use crate::arrow::ArrowWriter; + #[cfg(feature = "encryption")] + use crate::encryption::decrypt::FileDecryptionProperties; use crate::file::metadata::ParquetMetaDataReader; use crate::file::properties::WriterProperties; #[cfg(feature = "encryption")] @@ -1211,7 +1208,7 @@ mod tests { fn get_metadata_with_options<'a>( &'a mut self, - options: &'a ArrowReaderOptions, + _options: &'a ArrowReaderOptions, ) -> BoxFuture<'a, Result>> { futures::future::ready(Ok(self.metadata.clone())).boxed() } diff --git a/parquet/src/arrow/async_reader/store.rs b/parquet/src/arrow/async_reader/store.rs index 128316b51316..6922f3d1f7a3 100644 --- a/parquet/src/arrow/async_reader/store.rs +++ b/parquet/src/arrow/async_reader/store.rs @@ -176,7 +176,6 @@ impl AsyncFileReader for ParquetObjectReader { }) } - #[cfg(feature = "encryption")] fn get_metadata_with_options<'a>( &'a mut self, options: &'a ArrowReaderOptions, @@ -186,10 +185,13 @@ impl AsyncFileReader for ParquetObjectReader { let metadata = ParquetMetaDataReader::new() .with_column_indexes(self.preload_column_index) .with_offset_indexes(self.preload_offset_index) - .with_prefetch_hint(self.metadata_size_hint) - .with_decryption_properties(options.file_decryption_properties.as_ref()) - .load_and_finish(self, file_size) - .await?; + 
.with_prefetch_hint(self.metadata_size_hint); + + #[cfg(feature = "encryption")] + let metadata = + metadata.with_decryption_properties(options.file_decryption_properties.as_ref()); + + let metadata = metadata.load_and_finish(self, file_size).await?; Ok(Arc::new(metadata)) }) @@ -252,19 +254,19 @@ mod tests { .unwrap(); let options = ArrowReaderOptions::new().with_file_decryption_properties(decryption_properties); - let mut binding = ParquetObjectReader::new(store, meta); - let _binding = binding.get_metadata_with_options(&options); - - // todo: this should pass - // let object_reader = binding.await.unwrap(); - - // let builder = ParquetRecordBatchStreamBuilder::new_with_options(object_reader, options) - // .await - // .unwrap(); - // let batches: Vec<_> = builder.build().unwrap().try_collect().await.unwrap(); - // - // assert_eq!(batches.len(), 1); - // assert_eq!(batches[0].num_rows(), 8); + let mut reader = ParquetObjectReader::new(store.clone(), meta.clone()); + let metadata = reader.get_metadata_with_options(&options).await.unwrap(); + + assert_eq!(metadata.num_row_groups(), 1); + + let reader = ParquetObjectReader::new(store, meta); + let builder = ParquetRecordBatchStreamBuilder::new_with_options(reader, options) + .await + .unwrap(); + let batches: Vec<_> = builder.build().unwrap().try_collect().await.unwrap(); + + assert_eq!(batches.len(), 1); + assert_eq!(batches[0].num_rows(), 50); } #[tokio::test] diff --git a/parquet/src/file/metadata/reader.rs b/parquet/src/file/metadata/reader.rs index 63f9c8e3b5c9..f308f1619997 100644 --- a/parquet/src/file/metadata/reader.rs +++ b/parquet/src/file/metadata/reader.rs @@ -414,8 +414,9 @@ impl ParquetMetaDataReader { mut fetch: F, file_size: usize, ) -> Result<()> { - let (metadata, remainder) = - Self::load_metadata(&mut fetch, file_size, self.get_prefetch_size()).await?; + let (metadata, remainder) = self + .load_metadata(&mut fetch, file_size, self.get_prefetch_size()) + .await?; self.metadata = Some(metadata); @@ 
-560,22 +561,12 @@ impl ParquetMetaDataReader { if footer_metadata_len > file_size as usize { return Err(ParquetError::NeedMoreData(footer_metadata_len)); } - #[cfg(not(feature = "encryption"))] - if footer.is_encrypted_footer() { - return Err(general_err!( - "Parquet file has an encrypted footer but the encryption feature is disabled" - )); - } let start = file_size - footer_metadata_len as u64; - #[cfg(feature = "encryption")] - return Self::decode_metadata_with_encryption( + self.decode_footer_metadata( chunk_reader.get_bytes(start, metadata_len)?.as_ref(), - footer.is_encrypted_footer(), - self.file_decryption_properties.as_ref(), - ); - #[cfg(not(feature = "encryption"))] - Self::decode_metadata(chunk_reader.get_bytes(start, metadata_len)?.as_ref()) + &footer, + ) } /// Return the number of bytes to read in the initial pass. If `prefetch_size` has @@ -593,6 +584,7 @@ impl ParquetMetaDataReader { #[cfg(all(feature = "async", feature = "arrow"))] async fn load_metadata( + &self, fetch: &mut F, file_size: usize, prefetch: usize, @@ -635,12 +627,12 @@ impl ParquetMetaDataReader { if length > suffix_len - FOOTER_SIZE { let metadata_start = file_size - length - FOOTER_SIZE; let meta = fetch.fetch(metadata_start..file_size - FOOTER_SIZE).await?; - Ok((Self::decode_metadata(&meta)?, None)) + Ok((self.decode_footer_metadata(&meta, &footer)?, None)) } else { let metadata_start = file_size - length - FOOTER_SIZE - footer_start; let slice = &suffix[metadata_start..suffix_len - FOOTER_SIZE]; Ok(( - Self::decode_metadata(slice)?, + self.decode_footer_metadata(slice, &footer)?, Some((footer_start, suffix.slice(..metadata_start))), )) } @@ -681,17 +673,50 @@ impl ParquetMetaDataReader { Self::decode_footer_tail(slice).map(|f| f.metadata_length) } - /// Decodes [`ParquetMetaData`] from the provided encrypted bytes. + /// Decodes [`ParquetMetaData`] from the provided bytes. + /// + /// Typically, this is used to decode the metadata from the end of a parquet + /// file. 
The format of `buf` is the Thrift compact binary protocol, as specified + /// by the [Parquet Spec]. + /// + /// This method handles using either decode_footer of + /// decode_metadata_with_encryption depending on whether the encryption + /// feature is enabled. + /// + /// [Parquet Spec]: https://github.com/apache/parquet-format#metadata + pub(crate) fn decode_footer_metadata( + &self, + buf: &[u8], + footer_tail: &FooterTail, + ) -> Result { + #[cfg(feature = "encryption")] + let result = Self::decode_metadata_with_encryption( + buf, + footer_tail.is_encrypted_footer(), + self.file_decryption_properties.as_ref(), + ); + #[cfg(not(feature = "encryption"))] + let result = { + if footer_tail.is_encrypted_footer() { + Err(general_err!("Parquet error: Parquet file has an encrypted footer but the encryption feature is disabled")) + } else { + Self::decode_metadata(buf) + } + }; + result + } + + /// Decodes [`ParquetMetaData`] from the provided bytes, handling metadata that may be encrypted. /// /// Typically this is used to decode the metadata from the end of a parquet /// file. The format of `buf` is the Thrift compact binary protocol, as specified - /// by the [Parquet Spec]. Buffer can is encrypted with AES GCM or AES CTR + /// by the [Parquet Spec]. Buffer can be encrypted with AES GCM or AES CTR /// ciphers as specfied in the [Parquet Encryption Spec]. 
/// /// [Parquet Spec]: https://github.com/apache/parquet-format#metadata /// [Parquet Encryption Spec]: https://parquet.apache.org/docs/file-format/data-pages/encryption/ #[cfg(feature = "encryption")] - pub(crate) fn decode_metadata_with_encryption( + fn decode_metadata_with_encryption( buf: &[u8], encrypted_footer: bool, file_decryption_properties: Option<&FileDecryptionProperties>, From 07ad9dddcb0de2d7cdf19c2b36cc6015888d7e7d Mon Sep 17 00:00:00 2001 From: Adam Reeve Date: Tue, 11 Mar 2025 15:32:49 +1300 Subject: [PATCH 95/97] Add default get_metadata_with_options implementation --- parquet/src/arrow/async_reader/mod.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index 34c43a863ce9..d05bc2472b19 100644 --- a/parquet/src/arrow/async_reader/mod.rs +++ b/parquet/src/arrow/async_reader/mod.rs @@ -116,7 +116,10 @@ pub trait AsyncFileReader: Send { fn get_metadata_with_options<'a>( &'a mut self, options: &'a ArrowReaderOptions, - ) -> BoxFuture<'a, Result>>; + ) -> BoxFuture<'a, Result>> { + let _ = options; + self.get_metadata() + } } /// This allows Box to be used as an AsyncFileReader, From e58ae7c5831977a2834490a14a10c919c517a69a Mon Sep 17 00:00:00 2001 From: Adam Reeve Date: Tue, 11 Mar 2025 15:52:49 +1300 Subject: [PATCH 96/97] Minor tidy ups and test fix --- parquet/src/arrow/async_reader/mod.rs | 3 +-- parquet/src/file/metadata/reader.rs | 8 +++++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index d05bc2472b19..9afd7d835528 100644 --- a/parquet/src/arrow/async_reader/mod.rs +++ b/parquet/src/arrow/async_reader/mod.rs @@ -216,8 +216,7 @@ impl AsyncFileReader for T { let mut buf = Vec::with_capacity(metadata_len); self.take(metadata_len as _).read_to_end(&mut buf).await?; - let parquet_metadata_reader = ParquetMetaDataReader::decode_metadata(&buf)?; - 
Ok(Arc::new(parquet_metadata_reader)) + Ok(Arc::new(ParquetMetaDataReader::decode_metadata(&buf)?)) } .boxed() } diff --git a/parquet/src/file/metadata/reader.rs b/parquet/src/file/metadata/reader.rs index f308f1619997..b80e76d7929a 100644 --- a/parquet/src/file/metadata/reader.rs +++ b/parquet/src/file/metadata/reader.rs @@ -679,8 +679,8 @@ impl ParquetMetaDataReader { /// file. The format of `buf` is the Thrift compact binary protocol, as specified /// by the [Parquet Spec]. /// - /// This method handles using either decode_footer of - /// decode_metadata_with_encryption depending on whether the encryption + /// This method handles using either `decode_metadata` or + /// `decode_metadata_with_encryption` depending on whether the encryption /// feature is enabled. /// /// [Parquet Spec]: https://github.com/apache/parquet-format#metadata @@ -698,7 +698,9 @@ impl ParquetMetaDataReader { #[cfg(not(feature = "encryption"))] let result = { if footer_tail.is_encrypted_footer() { - Err(general_err!("Parquet error: Parquet file has an encrypted footer but the encryption feature is disabled")) + Err(general_err!( + "Parquet file has an encrypted footer but the encryption feature is disabled" + )) } else { Self::decode_metadata(buf) } From 65f9bc41467f3db6c428570e692139b37a5069c5 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 11 Mar 2025 15:14:43 +0100 Subject: [PATCH 97/97] Update parquet/src/encryption/mod.rs --- parquet/src/encryption/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet/src/encryption/mod.rs b/parquet/src/encryption/mod.rs index 6e9168edb9f2..b4f4d6d221be 100644 --- a/parquet/src/encryption/mod.rs +++ b/parquet/src/encryption/mod.rs @@ -19,5 +19,5 @@ //! in the [spec](https://github.com/apache/parquet-format/blob/master/Encryption.md). pub(crate) mod ciphers; -pub(crate) mod decrypt; +pub mod decrypt; pub(crate) mod modules;