Skip to content

Commit 619d77e

Browse files
authored
Revert "Write Bloom filters between row groups instead of the end (#5860)"
This reverts commit 3930d5b.
1 parent 3930d5b commit 619d77e

File tree

7 files changed: +52 additions, -277 deletions

parquet/Cargo.toml

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,6 @@ hashbrown = { version = "0.14", default-features = false }
6767
twox-hash = { version = "1.6", default-features = false }
6868
paste = { version = "1.0" }
6969
half = { version = "2.1", default-features = false, features = ["num-traits"] }
70-
sysinfo = { version = "0.30.12", optional = true, default-features = false }
7170

7271
[dev-dependencies]
7372
base64 = { version = "0.22", default-features = false, features = ["std"] }
@@ -115,19 +114,12 @@ async = ["futures", "tokio"]
115114
object_store = ["dep:object_store", "async"]
116115
# Group Zstd dependencies
117116
zstd = ["dep:zstd", "zstd-sys"]
118-
# Display memory in example/write_parquet.rs
119-
sysinfo = ["dep:sysinfo"]
120117

121118
[[example]]
122119
name = "read_parquet"
123120
required-features = ["arrow"]
124121
path = "./examples/read_parquet.rs"
125122

126-
[[example]]
127-
name = "write_parquet"
128-
required-features = ["cli", "sysinfo"]
129-
path = "./examples/write_parquet.rs"
130-
131123
[[example]]
132124
name = "async_read_parquet"
133125
required-features = ["arrow", "async"]

parquet/examples/write_parquet.rs

Lines changed: 0 additions & 131 deletions
This file was deleted.

parquet/src/arrow/arrow_writer/mod.rs

Lines changed: 3 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ use crate::column::writer::{
4343
};
4444
use crate::data_type::{ByteArray, FixedLenByteArray};
4545
use crate::errors::{ParquetError, Result};
46-
use crate::file::metadata::{ColumnChunkMetaData, KeyValue, RowGroupMetaData};
46+
use crate::file::metadata::{ColumnChunkMetaData, KeyValue, RowGroupMetaDataPtr};
4747
use crate::file::properties::{WriterProperties, WriterPropertiesPtr};
4848
use crate::file::reader::{ChunkReader, Length};
4949
use crate::file::writer::{SerializedFileWriter, SerializedRowGroupWriter};
@@ -199,7 +199,7 @@ impl<W: Write + Send> ArrowWriter<W> {
199199
}
200200

201201
/// Returns metadata for any flushed row groups
202-
pub fn flushed_row_groups(&self) -> &[RowGroupMetaData] {
202+
pub fn flushed_row_groups(&self) -> &[RowGroupMetaDataPtr] {
203203
self.writer.flushed_row_groups()
204204
}
205205

@@ -1053,9 +1053,7 @@ mod tests {
10531053
use crate::file::metadata::ParquetMetaData;
10541054
use crate::file::page_index::index::Index;
10551055
use crate::file::page_index::index_reader::read_pages_locations;
1056-
use crate::file::properties::{
1057-
BloomFilterPosition, EnabledStatistics, ReaderProperties, WriterVersion,
1058-
};
1056+
use crate::file::properties::{EnabledStatistics, ReaderProperties, WriterVersion};
10591057
use crate::file::serialized_reader::ReadOptionsBuilder;
10601058
use crate::file::{
10611059
reader::{FileReader, SerializedFileReader},
@@ -1703,7 +1701,6 @@ mod tests {
17031701
values: ArrayRef,
17041702
schema: SchemaRef,
17051703
bloom_filter: bool,
1706-
bloom_filter_position: BloomFilterPosition,
17071704
}
17081705

17091706
impl RoundTripOptions {
@@ -1714,7 +1711,6 @@ mod tests {
17141711
values,
17151712
schema: Arc::new(schema),
17161713
bloom_filter: false,
1717-
bloom_filter_position: BloomFilterPosition::AfterRowGroup,
17181714
}
17191715
}
17201716
}
@@ -1734,7 +1730,6 @@ mod tests {
17341730
values,
17351731
schema,
17361732
bloom_filter,
1737-
bloom_filter_position,
17381733
} = options;
17391734

17401735
let encodings = match values.data_type() {
@@ -1775,7 +1770,6 @@ mod tests {
17751770
.set_dictionary_page_size_limit(dictionary_size.max(1))
17761771
.set_encoding(*encoding)
17771772
.set_bloom_filter_enabled(bloom_filter)
1778-
.set_bloom_filter_position(bloom_filter_position)
17791773
.build();
17801774

17811775
files.push(roundtrip_opts(&expected_batch, props))
@@ -2133,22 +2127,6 @@ mod tests {
21332127
values_required::<BinaryViewArray, _>(many_vecs_iter);
21342128
}
21352129

2136-
#[test]
2137-
fn i32_column_bloom_filter_at_end() {
2138-
let array = Arc::new(Int32Array::from_iter(0..SMALL_SIZE as i32));
2139-
let mut options = RoundTripOptions::new(array, false);
2140-
options.bloom_filter = true;
2141-
options.bloom_filter_position = BloomFilterPosition::End;
2142-
2143-
let files = one_column_roundtrip_with_options(options);
2144-
check_bloom_filter(
2145-
files,
2146-
"col".to_string(),
2147-
(0..SMALL_SIZE as i32).collect(),
2148-
(SMALL_SIZE as i32 + 1..SMALL_SIZE as i32 + 10).collect(),
2149-
);
2150-
}
2151-
21522130
#[test]
21532131
fn i32_column_bloom_filter() {
21542132
let array = Arc::new(Int32Array::from_iter(0..SMALL_SIZE as i32));

parquet/src/arrow/async_writer/mod.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ use crate::{
5454
arrow::arrow_writer::ArrowWriterOptions,
5555
arrow::ArrowWriter,
5656
errors::{ParquetError, Result},
57-
file::{metadata::RowGroupMetaData, properties::WriterProperties},
57+
file::{metadata::RowGroupMetaDataPtr, properties::WriterProperties},
5858
format::{FileMetaData, KeyValue},
5959
};
6060
use arrow_array::RecordBatch;
@@ -172,7 +172,7 @@ impl<W: AsyncFileWriter> AsyncArrowWriter<W> {
172172
}
173173

174174
/// Returns metadata for any flushed row groups
175-
pub fn flushed_row_groups(&self) -> &[RowGroupMetaData] {
175+
pub fn flushed_row_groups(&self) -> &[RowGroupMetaDataPtr] {
176176
self.sync_writer.flushed_row_groups()
177177
}
178178

parquet/src/file/metadata.rs

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -333,11 +333,6 @@ impl RowGroupMetaData {
333333
&self.columns
334334
}
335335

336-
/// Returns mutable slice of column chunk metadata.
337-
pub fn columns_mut(&mut self) -> &mut [ColumnChunkMetaData] {
338-
&mut self.columns
339-
}
340-
341336
/// Number of rows in this row group.
342337
pub fn num_rows(&self) -> i64 {
343338
self.num_rows

parquet/src/file/properties.rs

Lines changed: 0 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -43,8 +43,6 @@ pub const DEFAULT_STATISTICS_ENABLED: EnabledStatistics = EnabledStatistics::Pag
4343
pub const DEFAULT_MAX_STATISTICS_SIZE: usize = 4096;
4444
/// Default value for [`WriterProperties::max_row_group_size`]
4545
pub const DEFAULT_MAX_ROW_GROUP_SIZE: usize = 1024 * 1024;
46-
/// Default value for [`WriterProperties::bloom_filter_position`]
47-
pub const DEFAULT_BLOOM_FILTER_POSITION: BloomFilterPosition = BloomFilterPosition::AfterRowGroup;
4846
/// Default value for [`WriterProperties::created_by`]
4947
pub const DEFAULT_CREATED_BY: &str = concat!("parquet-rs version ", env!("CARGO_PKG_VERSION"));
5048
/// Default value for [`WriterProperties::column_index_truncate_length`]
@@ -88,24 +86,6 @@ impl FromStr for WriterVersion {
8886
}
8987
}
9088

91-
/// Where in the file [`ArrowWriter`](crate::arrow::arrow_writer::ArrowWriter) should
92-
/// write Bloom filters
93-
///
94-
/// Basic constant, which is not part of the Thrift definition.
95-
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
96-
pub enum BloomFilterPosition {
97-
/// Write Bloom Filters of each row group right after the row group
98-
///
99-
/// This saves memory by writing it as soon as it is computed, at the cost
100-
/// of data locality for readers
101-
AfterRowGroup,
102-
/// Write Bloom Filters at the end of the file
103-
///
104-
/// This allows better data locality for readers, at the cost of memory usage
105-
/// for writers.
106-
End,
107-
}
108-
10989
/// Reference counted writer properties.
11090
pub type WriterPropertiesPtr = Arc<WriterProperties>;
11191

@@ -150,7 +130,6 @@ pub struct WriterProperties {
150130
data_page_row_count_limit: usize,
151131
write_batch_size: usize,
152132
max_row_group_size: usize,
153-
bloom_filter_position: BloomFilterPosition,
154133
writer_version: WriterVersion,
155134
created_by: String,
156135
pub(crate) key_value_metadata: Option<Vec<KeyValue>>,
@@ -238,11 +217,6 @@ impl WriterProperties {
238217
self.max_row_group_size
239218
}
240219

241-
/// Returns maximum number of rows in a row group.
242-
pub fn bloom_filter_position(&self) -> BloomFilterPosition {
243-
self.bloom_filter_position
244-
}
245-
246220
/// Returns configured writer version.
247221
pub fn writer_version(&self) -> WriterVersion {
248222
self.writer_version
@@ -364,7 +338,6 @@ pub struct WriterPropertiesBuilder {
364338
data_page_row_count_limit: usize,
365339
write_batch_size: usize,
366340
max_row_group_size: usize,
367-
bloom_filter_position: BloomFilterPosition,
368341
writer_version: WriterVersion,
369342
created_by: String,
370343
key_value_metadata: Option<Vec<KeyValue>>,
@@ -384,7 +357,6 @@ impl WriterPropertiesBuilder {
384357
data_page_row_count_limit: usize::MAX,
385358
write_batch_size: DEFAULT_WRITE_BATCH_SIZE,
386359
max_row_group_size: DEFAULT_MAX_ROW_GROUP_SIZE,
387-
bloom_filter_position: DEFAULT_BLOOM_FILTER_POSITION,
388360
writer_version: DEFAULT_WRITER_VERSION,
389361
created_by: DEFAULT_CREATED_BY.to_string(),
390362
key_value_metadata: None,
@@ -404,7 +376,6 @@ impl WriterPropertiesBuilder {
404376
data_page_row_count_limit: self.data_page_row_count_limit,
405377
write_batch_size: self.write_batch_size,
406378
max_row_group_size: self.max_row_group_size,
407-
bloom_filter_position: self.bloom_filter_position,
408379
writer_version: self.writer_version,
409380
created_by: self.created_by,
410381
key_value_metadata: self.key_value_metadata,
@@ -516,12 +487,6 @@ impl WriterPropertiesBuilder {
516487
self
517488
}
518489

519-
/// Sets where in the final file Bloom Filters are written (default `AfterRowGroup`)
520-
pub fn set_bloom_filter_position(mut self, value: BloomFilterPosition) -> Self {
521-
self.bloom_filter_position = value;
522-
self
523-
}
524-
525490
/// Sets "created by" property (defaults to `parquet-rs version <VERSION>`).
526491
pub fn set_created_by(mut self, value: String) -> Self {
527492
self.created_by = value;
@@ -1087,7 +1052,6 @@ mod tests {
10871052
);
10881053
assert_eq!(props.write_batch_size(), DEFAULT_WRITE_BATCH_SIZE);
10891054
assert_eq!(props.max_row_group_size(), DEFAULT_MAX_ROW_GROUP_SIZE);
1090-
assert_eq!(props.bloom_filter_position(), DEFAULT_BLOOM_FILTER_POSITION);
10911055
assert_eq!(props.writer_version(), DEFAULT_WRITER_VERSION);
10921056
assert_eq!(props.created_by(), DEFAULT_CREATED_BY);
10931057
assert_eq!(props.key_value_metadata(), None);

0 commit comments

Comments
 (0)