
Commit c0ff96f

Enable reading string view by default from Parquet

1 parent 470b3a8 commit c0ff96f

File tree

13 files changed: +46 -59 lines changed

benchmarks/src/clickbench.rs (-1)

@@ -119,7 +119,6 @@ impl RunOpt {
         let mut config = self.common.config();
         {
             let parquet_options = &mut config.options_mut().execution.parquet;
-            parquet_options.schema_force_view_types = self.common.force_view_types;
             // The hits_partitioned dataset specifies string columns
             // as binary due to how it was written. Force it to strings
             parquet_options.binary_as_string = true;

benchmarks/src/imdb/run.rs (+1 -7)

@@ -305,11 +305,7 @@ impl RunOpt {
             .config()
             .with_collect_statistics(!self.disable_statistics);
         config.options_mut().optimizer.prefer_hash_join = self.prefer_hash_join;
-        config
-            .options_mut()
-            .execution
-            .parquet
-            .schema_force_view_types = self.common.force_view_types;
+
         let ctx = SessionContext::new_with_config(config);

         // register tables

@@ -517,7 +513,6 @@ mod tests {
             partitions: Some(2),
             batch_size: 8192,
             debug: false,
-            force_view_types: false,
         };
         let opt = RunOpt {
             query: Some(query),

@@ -551,7 +546,6 @@ mod tests {
             partitions: Some(2),
             batch_size: 8192,
             debug: false,
-            force_view_types: false,
         };
         let opt = RunOpt {
             query: Some(query),

benchmarks/src/tpch/run.rs (-7)

@@ -120,11 +120,6 @@ impl RunOpt {
             .config()
             .with_collect_statistics(!self.disable_statistics);
         config.options_mut().optimizer.prefer_hash_join = self.prefer_hash_join;
-        config
-            .options_mut()
-            .execution
-            .parquet
-            .schema_force_view_types = self.common.force_view_types;
         let ctx = SessionContext::new_with_config(config);

         // register tables

@@ -345,7 +340,6 @@ mod tests {
             partitions: Some(2),
             batch_size: 8192,
             debug: false,
-            force_view_types: false,
         };
         let opt = RunOpt {
             query: Some(query),

@@ -379,7 +373,6 @@ mod tests {
             partitions: Some(2),
             batch_size: 8192,
             debug: false,
-            force_view_types: false,
         };
         let opt = RunOpt {
             query: Some(query),

benchmarks/src/util/options.rs (-5)

@@ -37,11 +37,6 @@ pub struct CommonOpt {
     /// Activate debug mode to see more details
     #[structopt(short, long)]
     pub debug: bool,
-
-    /// If true, will use StringView/BinaryViewArray instead of String/BinaryArray
-    /// when reading ParquetFiles
-    #[structopt(long)]
-    pub force_view_types: bool,
 }

 impl CommonOpt {

datafusion/common/src/config.rs (+1 -1)

@@ -388,7 +388,7 @@ config_namespace! {

     /// (reading) If true, parquet reader will read columns of `Utf8/Utf8Large` with `Utf8View`,
     /// and `Binary/BinaryLarge` with `BinaryView`.
-    pub schema_force_view_types: bool, default = false
+    pub schema_force_view_types: bool, default = true

     /// (reading) If true, parquet reader will read columns of
     /// `Binary/LargeBinary` with `Utf8`, and `BinaryView` with `Utf8View`.
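With the default flipped to true, Parquet scans now produce Utf8View/BinaryView columns out of the box. Callers who need the previous Utf8/Binary behavior can still opt out per session through the same option the removed benchmark code was setting; a minimal sketch (the table name and file path are illustrative):

use datafusion::prelude::*;

#[tokio::main]
async fn main() -> datafusion::error::Result<()> {
    let mut config = SessionConfig::new();
    // Revert to the pre-change behavior: read Parquet string/binary columns
    // as Utf8/Binary instead of Utf8View/BinaryView.
    config.options_mut().execution.parquet.schema_force_view_types = false;
    let ctx = SessionContext::new_with_config(config);

    // Parquet tables registered on this context now report the old types.
    ctx.register_parquet("t", "data.parquet", ParquetReadOptions::default())
        .await?;
    ctx.sql("DESCRIBE t").await?.show().await?;
    Ok(())
}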

datafusion/common/src/scalar/mod.rs (+5)

@@ -978,6 +978,11 @@ impl ScalarValue {
         ScalarValue::from(val.into())
     }

+    /// Returns a [`ScalarValue::Utf8View`] representing `val`
+    pub fn new_utf8view(val: impl Into<String>) -> Self {
+        ScalarValue::Utf8View(Some(val.into()))
+    }
+
     /// Returns a [`ScalarValue::IntervalYearMonth`] representing
     /// `years` years and `months` months
     pub fn new_interval_ym(years: i32, months: i32) -> Self {
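The new helper follows the pattern of the surrounding constructors (`ScalarValue::new_interval_ym` and friends). A quick sketch of what it produces:

use datafusion_common::ScalarValue;

fn main() {
    // new_utf8view is shorthand for spelling out the Utf8View variant by hand
    let v = ScalarValue::new_utf8view("01/01/09");
    assert_eq!(v, ScalarValue::Utf8View(Some("01/01/09".to_string())));
}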

datafusion/core/tests/parquet/page_pruning.rs (+3 -2)

@@ -149,8 +149,9 @@ async fn page_index_filter_one_col() {
     let session_ctx = SessionContext::new();
     let task_ctx = session_ctx.task_ctx();

-    // 5.create filter date_string_col == 1;
-    let filter = col("date_string_col").eq(lit("01/01/09"));
+    // 5.create filter date_string_col == "01/01/09";
+    // Note this test doesn't apply type coercion so the literal must match the actual view type
+    let filter = col("date_string_col").eq(lit(ScalarValue::new_utf8view("01/01/09")));
     let parquet_exec = get_parquet_exec(&state, filter).await;
     let mut results = parquet_exec.execute(0, task_ctx.clone()).unwrap();
     let batch = results.next().await.unwrap().unwrap();
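The note in the diff above is the crux: this test builds the physical plan directly, skipping the analyzer's type coercion, so once the Parquet reader produces Utf8View columns a plain Utf8 literal would compare mismatched types. A hedged sketch of the distinction, reusing the test's column name (expression construction only):

use datafusion::common::ScalarValue;
use datafusion::prelude::*;

fn main() {
    // lit("...") builds a Utf8 literal; with no coercion step it no longer
    // lines up against a column the reader now exposes as Utf8View.
    let _old = col("date_string_col").eq(lit("01/01/09"));

    // Building the literal as Utf8View keeps both sides of the comparison
    // at the same type, so the filter evaluates without coercion.
    let _new = col("date_string_col").eq(lit(ScalarValue::new_utf8view("01/01/09")));
}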

datafusion/sqllogictest/test_files/describe.slt (+2 -2)

@@ -81,8 +81,8 @@ int_col Int32 YES
 bigint_col Int64 YES
 float_col Float32 YES
 double_col Float64 YES
-date_string_col Utf8 YES
-string_col Utf8 YES
+date_string_col Utf8View YES
+string_col Utf8View YES
 timestamp_col Timestamp(Nanosecond, None) YES
 year Int32 YES
 month Int32 YES

datafusion/sqllogictest/test_files/explain.slt (+6 -6)

@@ -305,8 +305,8 @@ initial_physical_plan
 01)GlobalLimitExec: skip=0, fetch=10, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]]
 02)--ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]]
 initial_physical_plan_with_schema
-01)GlobalLimitExec: skip=0, fetch=10, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:Binary;N, string_col:Binary;N, timestamp_col:Timestamp(Nanosecond, None);N]
-02)--ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:Binary;N, string_col:Binary;N, timestamp_col:Timestamp(Nanosecond, None);N]
+01)GlobalLimitExec: skip=0, fetch=10, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:BinaryView;N, string_col:BinaryView;N, timestamp_col:Timestamp(Nanosecond, None);N]
+02)--ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:BinaryView;N, string_col:BinaryView;N, timestamp_col:Timestamp(Nanosecond, None);N]
 physical_plan after OutputRequirements
 01)OutputRequirementExec, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]]
 02)--GlobalLimitExec: skip=0, fetch=10, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]]

@@ -328,7 +328,7 @@ physical_plan after ProjectionPushdown SAME TEXT AS ABOVE
 physical_plan after LimitPushdown ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]]
 physical_plan after SanityCheckPlan SAME TEXT AS ABOVE
 physical_plan ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]]
-physical_plan_with_schema ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:Binary;N, string_col:Binary;N, timestamp_col:Timestamp(Nanosecond, None);N]
+physical_plan_with_schema ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:BinaryView;N, string_col:BinaryView;N, timestamp_col:Timestamp(Nanosecond, None);N]


 statement ok

@@ -345,8 +345,8 @@ initial_physical_plan_with_stats
 01)GlobalLimitExec: skip=0, fetch=10, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]]
 02)--ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]]
 initial_physical_plan_with_schema
-01)GlobalLimitExec: skip=0, fetch=10, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:Binary;N, string_col:Binary;N, timestamp_col:Timestamp(Nanosecond, None);N]
-02)--ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:Binary;N, string_col:Binary;N, timestamp_col:Timestamp(Nanosecond, None);N]
+01)GlobalLimitExec: skip=0, fetch=10, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:BinaryView;N, string_col:BinaryView;N, timestamp_col:Timestamp(Nanosecond, None);N]
+02)--ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:BinaryView;N, string_col:BinaryView;N, timestamp_col:Timestamp(Nanosecond, None);N]
 physical_plan after OutputRequirements
 01)OutputRequirementExec
 02)--GlobalLimitExec: skip=0, fetch=10

@@ -369,7 +369,7 @@ physical_plan after LimitPushdown ParquetExec: file_groups={1 group: [[WORKSPACE
 physical_plan after SanityCheckPlan SAME TEXT AS ABOVE
 physical_plan ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10
 physical_plan_with_stats ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]]
-physical_plan_with_schema ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:Binary;N, string_col:Binary;N, timestamp_col:Timestamp(Nanosecond, None);N]
+physical_plan_with_schema ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:BinaryView;N, string_col:BinaryView;N, timestamp_col:Timestamp(Nanosecond, None);N]


 statement ok

datafusion/sqllogictest/test_files/information_schema.slt (+2 -2)

@@ -202,7 +202,7 @@ datafusion.execution.parquet.metadata_size_hint NULL
 datafusion.execution.parquet.pruning true
 datafusion.execution.parquet.pushdown_filters false
 datafusion.execution.parquet.reorder_filters false
-datafusion.execution.parquet.schema_force_view_types false
+datafusion.execution.parquet.schema_force_view_types true
 datafusion.execution.parquet.skip_metadata true
 datafusion.execution.parquet.statistics_enabled page
 datafusion.execution.parquet.write_batch_size 1024

@@ -294,7 +294,7 @@ datafusion.execution.parquet.metadata_size_hint NULL (reading) If specified, the
 datafusion.execution.parquet.pruning true (reading) If true, the parquet reader attempts to skip entire row groups based on the predicate in the query and the metadata (min/max values) stored in the parquet file
 datafusion.execution.parquet.pushdown_filters false (reading) If true, filter expressions are be applied during the parquet decoding operation to reduce the number of rows decoded. This optimization is sometimes called "late materialization".
 datafusion.execution.parquet.reorder_filters false (reading) If true, filter expressions evaluated during the parquet decoding operation will be reordered heuristically to minimize the cost of evaluation. If false, the filters are applied in the same order as written in the query
-datafusion.execution.parquet.schema_force_view_types false (reading) If true, parquet reader will read columns of `Utf8/Utf8Large` with `Utf8View`, and `Binary/BinaryLarge` with `BinaryView`.
+datafusion.execution.parquet.schema_force_view_types true (reading) If true, parquet reader will read columns of `Utf8/Utf8Large` with `Utf8View`, and `Binary/BinaryLarge` with `BinaryView`.
 datafusion.execution.parquet.skip_metadata true (reading) If true, the parquet reader skip the optional embedded metadata that may be in the file Schema. This setting can help avoid schema conflicts when querying multiple parquet files with schemas containing compatible types but different metadata
 datafusion.execution.parquet.statistics_enabled page (writing) Sets if statistics are enabled for any column Valid values are: "none", "chunk", and "page" These values are not case sensitive. If NULL, uses default parquet writer setting
 datafusion.execution.parquet.write_batch_size 1024 (writing) Sets write_batch_size in bytes

datafusion/sqllogictest/test_files/map.slt (+1 -1)

@@ -42,7 +42,7 @@ describe data;
 ----
 ints Map(Field { name: "entries", data_type: Struct([Field { name: "key", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "value", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }]), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, false) NO
 strings Map(Field { name: "entries", data_type: Struct([Field { name: "key", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "value", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }]), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, false) NO
-timestamp Utf8 NO
+timestamp Utf8View NO

 query ??T
 SELECT * FROM data ORDER by ints['bytes'] DESC LIMIT 10;
