From 448b0ec3c0d6b648d18e821dea851658698d6611 Mon Sep 17 00:00:00 2001
From: Alexander Rafferty
Date: Sun, 28 Jul 2024 18:46:04 +1000
Subject: [PATCH 1/2] Fix #11687: Replace all uses of `OnceLock` with
 `LazyLock`

---
 Cargo.toml                                    |  2 +-
 datafusion-cli/src/main.rs                    | 20 +++---
 datafusion/core/Cargo.toml                    |  2 +-
 .../physical_plan/parquet/access_plan.rs      | 58 +++++++--------
 datafusion/core/tests/expr_api/mod.rs         | 72 +++++++++----------
 datafusion/core/tests/memory_limit/mod.rs     | 22 +++---
 .../tests/parquet/external_access_plan.rs     | 66 ++++++++---------
 datafusion/expr/src/test/function_stub.rs     | 16 ++---
 datafusion/functions-aggregate/src/macros.rs  | 16 ++---
 datafusion/functions-nested/src/macros.rs     | 19 ++---
 datafusion/functions/src/macros.rs            | 22 +++---
 .../functions/src/regex/regexpreplace.rs      | 15 ++--
 .../physical-expr/src/utils/guarantee.rs      | 22 +++---
 .../engines/datafusion_engine/normalize.rs    | 50 ++++++-------
 14 files changed, 177 insertions(+), 225 deletions(-)
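
Note for reviewers: every hunk below applies the same mechanical pattern.
A minimal standalone sketch of the before/after shape, with illustrative
names (nothing in it is taken from the DataFusion codebase):

    // Before: a OnceLock static only reserves the slot, so call sites go
    // through an accessor fn that runs get_or_init on first use.
    mod before {
        use std::sync::OnceLock;

        pub fn cache() -> &'static Vec<u32> {
            static CACHE: OnceLock<Vec<u32>> = OnceLock::new();
            CACHE.get_or_init(|| (0..10).collect())
        }
    }

    // After: LazyLock carries its init closure in the static itself and
    // runs it on first deref, so the accessor fn can be deleted.
    mod after {
        use std::sync::LazyLock;

        pub static CACHE: LazyLock<Vec<u32>> = LazyLock::new(|| (0..10).collect());
    }

    fn main() {
        assert_eq!(before::cache().len(), after::CACHE.len());
    }

`LazyLock` is stable only since Rust 1.80, which is why the MSRV entries
are bumped from 1.76 alongside the refactor.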
diff --git a/Cargo.toml b/Cargo.toml
index cdf3d2f93b93..6d8f28380f66 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -54,7 +54,7 @@ homepage = "https://datafusion.apache.org"
 license = "Apache-2.0"
 readme = "README.md"
 repository = "https://github.com/apache/datafusion"
-rust-version = "1.76"
+rust-version = "1.80"
 version = "40.0.0"
 
 [workspace.dependencies]
diff --git a/datafusion-cli/src/main.rs b/datafusion-cli/src/main.rs
index 6266ae6f561a..a2b52eb701a1 100644
--- a/datafusion-cli/src/main.rs
+++ b/datafusion-cli/src/main.rs
@@ -19,7 +19,7 @@ use std::collections::HashMap;
 use std::env;
 use std::path::Path;
 use std::process::ExitCode;
-use std::sync::{Arc, OnceLock};
+use std::sync::{Arc, LazyLock};
 
 use datafusion::error::{DataFusionError, Result};
 use datafusion::execution::context::SessionConfig;
@@ -295,9 +295,8 @@ impl ByteUnit {
 }
 
 fn extract_memory_pool_size(size: &str) -> Result<usize, String> {
-    fn byte_suffixes() -> &'static HashMap<&'static str, ByteUnit> {
-        static BYTE_SUFFIXES: OnceLock<HashMap<&'static str, ByteUnit>> = OnceLock::new();
-        BYTE_SUFFIXES.get_or_init(|| {
+    static BYTE_SUFFIXES: LazyLock<HashMap<&'static str, ByteUnit>> =
+        LazyLock::new(|| {
             let mut m = HashMap::new();
             m.insert("b", ByteUnit::Byte);
             m.insert("k", ByteUnit::KiB);
@@ -309,23 +308,20 @@ fn extract_memory_pool_size(size: &str) -> Result<usize, String> {
             m.insert("t", ByteUnit::TiB);
             m.insert("tb", ByteUnit::TiB);
             m
-        })
-    }
+        });
 
-    fn suffix_re() -> &'static regex::Regex {
-        static SUFFIX_REGEX: OnceLock<regex::Regex> = OnceLock::new();
-        SUFFIX_REGEX.get_or_init(|| regex::Regex::new(r"^(-?[0-9]+)([a-z]+)?$").unwrap())
-    }
+    static SUFFIX_REGEX: LazyLock<regex::Regex> =
+        LazyLock::new(|| regex::Regex::new(r"^(-?[0-9]+)([a-z]+)?$").unwrap());
 
     let lower = size.to_lowercase();
-    if let Some(caps) = suffix_re().captures(&lower) {
+    if let Some(caps) = SUFFIX_REGEX.captures(&lower) {
         let num_str = caps.get(1).unwrap().as_str();
         let num = num_str.parse::<usize>().map_err(|_| {
             format!("Invalid numeric value in memory pool size '{}'", size)
         })?;
 
         let suffix = caps.get(2).map(|m| m.as_str()).unwrap_or("b");
-        let unit = byte_suffixes()
+        let unit = BYTE_SUFFIXES
             .get(suffix)
             .ok_or_else(|| format!("Invalid memory pool size '{}'", size))?;
         let memory_pool_size = usize::try_from(unit.multiplier())
diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml
index 09b90a56d2aa..44d7563dba8a 100644
--- a/datafusion/core/Cargo.toml
+++ b/datafusion/core/Cargo.toml
@@ -30,7 +30,7 @@ authors = { workspace = true }
 # Specify MSRV here as `cargo msrv` doesn't support workspace version and fails with
 # "Unable to find key 'package.rust-version' (or 'package.metadata.msrv') in 'arrow-datafusion/Cargo.toml'"
 # https://github.com/foresterre/cargo-msrv/issues/590
-rust-version = "1.76"
+rust-version = "1.80"
 
 [lints]
 workspace = true
diff --git a/datafusion/core/src/datasource/physical_plan/parquet/access_plan.rs b/datafusion/core/src/datasource/physical_plan/parquet/access_plan.rs
index ea3030664b7b..0d77a99699bd 100644
--- a/datafusion/core/src/datasource/physical_plan/parquet/access_plan.rs
+++ b/datafusion/core/src/datasource/physical_plan/parquet/access_plan.rs
@@ -345,7 +345,7 @@ mod test {
     use parquet::basic::LogicalType;
     use parquet::file::metadata::ColumnChunkMetaData;
     use parquet::schema::types::{SchemaDescPtr, SchemaDescriptor};
-    use std::sync::{Arc, OnceLock};
+    use std::sync::{Arc, LazyLock};
 
     #[test]
     fn test_only_scans() {
@@ -358,7 +358,7 @@
 
         let row_group_indexes = access_plan.row_group_indexes();
         let row_selection = access_plan
-            .into_overall_row_selection(row_group_metadata())
+            .into_overall_row_selection(&ROW_GROUP_METADATA)
             .unwrap();
 
         // scan all row groups, no selection
@@ -377,7 +377,7 @@
 
         let row_group_indexes = access_plan.row_group_indexes();
         let row_selection = access_plan
-            .into_overall_row_selection(row_group_metadata())
+            .into_overall_row_selection(&ROW_GROUP_METADATA)
             .unwrap();
 
         // skip all row groups, no selection
@@ -403,7 +403,7 @@
 
         let row_group_indexes = access_plan.row_group_indexes();
         let row_selection = access_plan
-            .into_overall_row_selection(row_group_metadata())
+            .into_overall_row_selection(&ROW_GROUP_METADATA)
             .unwrap();
 
         assert_eq!(row_group_indexes, vec![0, 1]);
@@ -442,7 +442,7 @@
 
         let row_group_indexes = access_plan.row_group_indexes();
         let row_selection = access_plan
-            .into_overall_row_selection(row_group_metadata())
+            .into_overall_row_selection(&ROW_GROUP_METADATA)
             .unwrap();
 
         assert_eq!(row_group_indexes, vec![1, 2, 3]);
@@ -478,7 +478,7 @@
 
         let row_group_indexes = access_plan.row_group_indexes();
         let err = access_plan
-            .into_overall_row_selection(row_group_metadata())
+            .into_overall_row_selection(&ROW_GROUP_METADATA)
             .unwrap_err()
             .to_string();
         assert_eq!(row_group_indexes, vec![0, 1, 2, 3]);
@@ -504,39 +504,35 @@
 
         let row_group_indexes = access_plan.row_group_indexes();
         let err = access_plan
-            .into_overall_row_selection(row_group_metadata())
+            .into_overall_row_selection(&ROW_GROUP_METADATA)
             .unwrap_err()
             .to_string();
         assert_eq!(row_group_indexes, vec![0, 1, 2, 3]);
         assert_contains!(err, "Invalid ParquetAccessPlan Selection. Row group 1 has 20 rows but selection only specifies 22 rows");
     }
 
-    static ROW_GROUP_METADATA: OnceLock<Vec<RowGroupMetaData>> = OnceLock::new();
-
     /// [`RowGroupMetaData`] that returns 4 row groups with 10, 20, 30, 40 rows
     /// respectively
-    fn row_group_metadata() -> &'static [RowGroupMetaData] {
-        ROW_GROUP_METADATA.get_or_init(|| {
-            let schema_descr = get_test_schema_descr();
-            let row_counts = [10, 20, 30, 40];
-
-            row_counts
-                .into_iter()
-                .map(|num_rows| {
-                    let column = ColumnChunkMetaData::builder(schema_descr.column(0))
-                        .set_num_values(num_rows)
-                        .build()
-                        .unwrap();
-
-                    RowGroupMetaData::builder(schema_descr.clone())
-                        .set_num_rows(num_rows)
-                        .set_column_metadata(vec![column])
-                        .build()
-                        .unwrap()
-                })
-                .collect()
-        })
-    }
+    static ROW_GROUP_METADATA: LazyLock<Vec<RowGroupMetaData>> = LazyLock::new(|| {
+        let schema_descr = get_test_schema_descr();
+        let row_counts = [10, 20, 30, 40];
+
+        row_counts
+            .into_iter()
+            .map(|num_rows| {
+                let column = ColumnChunkMetaData::builder(schema_descr.column(0))
+                    .set_num_values(num_rows)
+                    .build()
+                    .unwrap();
+
+                RowGroupMetaData::builder(schema_descr.clone())
+                    .set_num_rows(num_rows)
+                    .set_column_metadata(vec![column])
+                    .build()
+                    .unwrap()
+            })
+            .collect()
+    });
 
     /// Single column schema with a single column named "a" of type `BYTE_ARRAY`/`String`
     fn get_test_schema_descr() -> SchemaDescPtr {
diff --git a/datafusion/core/tests/expr_api/mod.rs b/datafusion/core/tests/expr_api/mod.rs
index 051d65652633..cc0f0af9a9aa 100644
--- a/datafusion/core/tests/expr_api/mod.rs
+++ b/datafusion/core/tests/expr_api/mod.rs
@@ -28,7 +28,7 @@ use datafusion_functions_aggregate::sum::sum_udaf;
 use datafusion_functions_nested::expr_ext::{IndexAccessor, SliceAccessor};
 use sqlparser::ast::NullTreatment;
 /// Tests of using and evaluating `Expr`s outside the context of a LogicalPlan
-use std::sync::{Arc, OnceLock};
+use std::sync::{Arc, LazyLock};
 
 mod parse_sql_expr;
 mod simplification;
@@ -320,7 +320,7 @@ async fn test_aggregate_ext_null_treatment() {
 /// Evaluates the specified expr as an aggregate and compares the result to the
 /// expected result.
 async fn evaluate_agg_test(expr: Expr, expected_lines: Vec<&str>) {
-    let batch = test_batch();
+    let batch = TEST_BATCH.clone();
     let ctx = SessionContext::new();
 
     let group_expr = vec![];
@@ -347,7 +347,7 @@ async fn evaluate_agg_test(expr: Expr, expected_lines: Vec<&str>) {
 /// Converts the `Expr` to a `PhysicalExpr`, evaluates it against the provided
 /// `RecordBatch` and compares the result to the expected result.
 fn evaluate_expr_test(expr: Expr, expected_lines: Vec<&str>) {
-    let batch = test_batch();
+    let batch = TEST_BATCH.clone();
     let df_schema = DFSchema::try_from(batch.schema()).unwrap();
     let physical_expr = SessionContext::new()
         .create_physical_expr(expr, &df_schema)
@@ -365,39 +365,33 @@ fn evaluate_expr_test(expr: Expr, expected_lines: Vec<&str>) {
     );
 }
 
-static TEST_BATCH: OnceLock<RecordBatch> = OnceLock::new();
-
-fn test_batch() -> RecordBatch {
-    TEST_BATCH
-        .get_or_init(|| {
-            let string_array: ArrayRef = Arc::new(StringArray::from(vec!["1", "2", "3"]));
-            let int_array: ArrayRef =
-                Arc::new(Int64Array::from_iter(vec![Some(10), None, Some(5)]));
-
-            // { a: "2021-02-01" } { a: "2021-02-02" } { a: "2021-02-03" }
-            let struct_array: ArrayRef = Arc::from(StructArray::from(vec![(
-                Arc::new(Field::new("a", DataType::Utf8, false)),
-                Arc::new(StringArray::from(vec![
-                    "2021-02-01",
-                    "2021-02-02",
-                    "2021-02-03",
-                ])) as _,
-            )]));
-
-            // ["one"] ["two", "three", "four"] ["five"]
-            let mut builder = ListBuilder::new(StringBuilder::new());
-            builder.append_value([Some("one")]);
-            builder.append_value([Some("two"), Some("three"), Some("four")]);
-            builder.append_value([Some("five")]);
-            let list_array: ArrayRef = Arc::new(builder.finish());
-
-            RecordBatch::try_from_iter(vec![
-                ("id", string_array),
-                ("i", int_array),
-                ("props", struct_array),
-                ("list", list_array),
-            ])
-            .unwrap()
-        })
-        .clone()
-}
+static TEST_BATCH: LazyLock<RecordBatch> = LazyLock::new(|| {
+    let string_array: ArrayRef = Arc::new(StringArray::from(vec!["1", "2", "3"]));
+    let int_array: ArrayRef =
+        Arc::new(Int64Array::from_iter(vec![Some(10), None, Some(5)]));
+
+    // { a: "2021-02-01" } { a: "2021-02-02" } { a: "2021-02-03" }
+    let struct_array: ArrayRef = Arc::from(StructArray::from(vec![(
+        Arc::new(Field::new("a", DataType::Utf8, false)),
+        Arc::new(StringArray::from(vec![
+            "2021-02-01",
+            "2021-02-02",
+            "2021-02-03",
+        ])) as _,
+    )]));
+
+    // ["one"] ["two", "three", "four"] ["five"]
+    let mut builder = ListBuilder::new(StringBuilder::new());
+    builder.append_value([Some("one")]);
+    builder.append_value([Some("two"), Some("three"), Some("four")]);
+    builder.append_value([Some("five")]);
+    let list_array: ArrayRef = Arc::new(builder.finish());
+
+    RecordBatch::try_from_iter(vec![
+        ("id", string_array),
+        ("i", int_array),
+        ("props", struct_array),
+        ("list", list_array),
+    ])
+    .unwrap()
+});
diff --git a/datafusion/core/tests/memory_limit/mod.rs b/datafusion/core/tests/memory_limit/mod.rs
index f62a019eb960..a775e3458912 100644
--- a/datafusion/core/tests/memory_limit/mod.rs
+++ b/datafusion/core/tests/memory_limit/mod.rs
@@ -30,7 +30,7 @@ use datafusion_expr::{Expr, TableType};
 use datafusion_physical_expr::{LexOrdering, PhysicalSortExpr};
 use futures::StreamExt;
 use std::any::Any;
-use std::sync::{Arc, OnceLock};
+use std::sync::{Arc, LazyLock};
 use tokio::fs::File;
 
 use datafusion::datasource::streaming::StreamingTable;
@@ -568,7 +568,10 @@ impl Scenario {
             single_row_batches, ..
         } = self
         {
-            batches_byte_size(&maybe_split_batches(dict_batches(), *single_row_batches))
+            batches_byte_size(&maybe_split_batches(
+                DICT_BATCHES.clone(),
+                *single_row_batches,
+            ))
         } else {
             panic!("Scenario does not support partition size");
         }
@@ -604,7 +607,7 @@ impl Scenario {
             } => {
                 use datafusion::physical_expr::expressions::col;
                 let batches: Vec<Vec<RecordBatch>> = std::iter::repeat(maybe_split_batches(
-                    dict_batches(),
+                    DICT_BATCHES.clone(),
                     *single_row_batches,
                 ))
                 .take(*partitions)
@@ -686,18 +689,11 @@ fn maybe_split_batches(
         .collect()
 }
 
-static DICT_BATCHES: OnceLock<Vec<RecordBatch>> = OnceLock::new();
-
-/// Returns 5 sorted string dictionary batches each with 50 rows with
-/// this schema.
+/// 5 sorted string dictionary batches each with 50 rows with this schema.
 ///
 /// a: Dictionary<Int32, Utf8>,
 /// b: Dictionary<Int32, Utf8>,
-fn dict_batches() -> Vec<RecordBatch> {
-    DICT_BATCHES.get_or_init(make_dict_batches).clone()
-}
-
-fn make_dict_batches() -> Vec<RecordBatch> {
+static DICT_BATCHES: LazyLock<Vec<RecordBatch>> = LazyLock::new(|| {
     let batch_size = 50;
 
     let mut i = 0;
@@ -731,7 +727,7 @@ fn make_dict_batches() -> Vec<RecordBatch> {
     });
 
     batches
-}
+});
 
 // How many bytes does the memory from dict_batches consume?
 fn batches_byte_size(batches: &[RecordBatch]) -> usize {
diff --git a/datafusion/core/tests/parquet/external_access_plan.rs b/datafusion/core/tests/parquet/external_access_plan.rs
index 03afc858dfca..e6dc02fac556 100644
--- a/datafusion/core/tests/parquet/external_access_plan.rs
+++ b/datafusion/core/tests/parquet/external_access_plan.rs
@@ -33,7 +33,7 @@ use datafusion_physical_plan::ExecutionPlan;
 use parquet::arrow::arrow_reader::{RowSelection, RowSelector};
 use parquet::arrow::ArrowWriter;
 use parquet::file::properties::WriterProperties;
-use std::sync::{Arc, OnceLock};
+use std::sync::{Arc, LazyLock};
 use tempfile::NamedTempFile;
 
 #[tokio::test]
@@ -317,7 +317,7 @@ impl TestFull {
             schema,
             file_name,
             file_size,
-        } = get_test_data();
+        } = TEST_DATA;
 
         let mut partitioned_file = PartitionedFile::new(file_name, *file_size);
 
@@ -369,46 +369,42 @@ struct TestData {
     file_size: u64,
 }
 
-static TEST_DATA: OnceLock<TestData> = OnceLock::new();
+/// A parquet file with 2 row groups each with 5 rows
+static TEST_DATA: LazyLock<TestData> = LazyLock::new(|| {
+    let scenario = Scenario::UTF8;
+    let row_per_group = 5;
 
-/// Return a parquet file with 2 row groups each with 5 rows
-fn get_test_data() -> &'static TestData {
-    TEST_DATA.get_or_init(|| {
-        let scenario = Scenario::UTF8;
-        let row_per_group = 5;
+    let mut temp_file = tempfile::Builder::new()
+        .prefix("user_access_plan")
+        .suffix(".parquet")
+        .tempfile()
+        .expect("tempfile creation");
 
-        let mut temp_file = tempfile::Builder::new()
-            .prefix("user_access_plan")
-            .suffix(".parquet")
-            .tempfile()
-            .expect("tempfile creation");
+    let props = WriterProperties::builder()
+        .set_max_row_group_size(row_per_group)
+        .build();
 
-        let props = WriterProperties::builder()
-            .set_max_row_group_size(row_per_group)
-            .build();
+    let batches = create_data_batch(scenario);
+    let schema = batches[0].schema();
 
-        let batches = create_data_batch(scenario);
-        let schema = batches[0].schema();
+    let mut writer =
+        ArrowWriter::try_new(&mut temp_file, schema.clone(), Some(props)).unwrap();
 
-        let mut writer =
-            ArrowWriter::try_new(&mut temp_file, schema.clone(), Some(props)).unwrap();
-
-        for batch in batches {
-            writer.write(&batch).expect("writing batch");
-        }
-        writer.close().unwrap();
+    for batch in batches {
+        writer.write(&batch).expect("writing batch");
+    }
+    writer.close().unwrap();
 
-        let file_name = temp_file.path().to_string_lossy().to_string();
-        let file_size = temp_file.path().metadata().unwrap().len();
+    let file_name = temp_file.path().to_string_lossy().to_string();
+    let file_size = temp_file.path().metadata().unwrap().len();
 
-        TestData {
-            temp_file,
-            schema,
-            file_name,
-            file_size,
-        }
-    })
-}
+    TestData {
+        temp_file,
+        schema,
+        file_name,
+        file_size,
+    }
+});
 
 /// Return the total value of the specified metric name
 fn metric_value(parquet_metrics: &MetricsSet, metric_name: &str) -> Option<usize> {
diff --git a/datafusion/expr/src/test/function_stub.rs b/datafusion/expr/src/test/function_stub.rs
index 14a6522ebe91..a6c7f1d19396 100644
--- a/datafusion/expr/src/test/function_stub.rs
+++ b/datafusion/expr/src/test/function_stub.rs
@@ -20,6 +20,7 @@
 //! These are used to avoid a dependence on `datafusion-functions-aggregate` which live in a different crate
 
 use std::any::Any;
+use std::sync::{Arc, LazyLock};
 
 use arrow::datatypes::{
     DataType, Field, DECIMAL128_MAX_PRECISION, DECIMAL256_MAX_PRECISION,
@@ -43,18 +44,13 @@ macro_rules! create_func {
             /// Singleton instance of [$UDAF], ensures the UDAF is only created once
             /// named STATIC_$(UDAF). For example `STATIC_FirstValue`
             #[allow(non_upper_case_globals)]
-            static [< STATIC_ $UDAF >]: std::sync::OnceLock<std::sync::Arc<crate::AggregateUDF>> =
-                std::sync::OnceLock::new();
+            static [< STATIC_ $UDAF >]: LazyLock<Arc<crate::AggregateUDF>> = LazyLock::new(|| {
+                Arc::new(crate::AggregateUDF::from(<$UDAF>::default()))
+            });
 
-            /// AggregateFunction that returns a [AggregateUDF] for [$UDAF]
-            ///
-            /// [AggregateUDF]: crate::AggregateUDF
+            #[doc = concat!("AggregateFunction that returns a [AggregateUDF](crate::AggregateUDF) for [`", stringify!($UDAF), "`]")]
             pub fn $AGGREGATE_UDF_FN() -> std::sync::Arc<crate::AggregateUDF> {
-                [< STATIC_ $UDAF >]
-                    .get_or_init(|| {
-                        std::sync::Arc::new(crate::AggregateUDF::from(<$UDAF>::default()))
-                    })
-                    .clone()
+                [< STATIC_ $UDAF >].clone()
             }
         }
     }
diff --git a/datafusion/functions-aggregate/src/macros.rs b/datafusion/functions-aggregate/src/macros.rs
index cae72cf35223..233b3983df9d 100644
--- a/datafusion/functions-aggregate/src/macros.rs
+++ b/datafusion/functions-aggregate/src/macros.rs
@@ -85,18 +85,14 @@ macro_rules! create_func {
             /// Singleton instance of [$UDAF], ensures the UDAF is only created once
             /// named STATIC_$(UDAF). For example `STATIC_FirstValue`
             #[allow(non_upper_case_globals)]
-            static [< STATIC_ $UDAF >]: std::sync::OnceLock<std::sync::Arc<datafusion_expr::AggregateUDF>> =
-                std::sync::OnceLock::new();
+            static [< STATIC_ $UDAF >]: std::sync::LazyLock<std::sync::Arc<datafusion_expr::AggregateUDF>> =
+                std::sync::LazyLock::new(|| {
+                    std::sync::Arc::new(datafusion_expr::AggregateUDF::from($CREATE))
+                });
 
-            /// AggregateFunction that returns a [AggregateUDF] for [$UDAF]
-            ///
-            /// [AggregateUDF]: datafusion_expr::AggregateUDF
+            #[doc = concat!("AggregateFunction that returns a [`AggregateUDF`](datafusion_expr::AggregateUDF) for [`", stringify!($UDAF), "`]")]
             pub fn $AGGREGATE_UDF_FN() -> std::sync::Arc<datafusion_expr::AggregateUDF> {
-                [< STATIC_ $UDAF >]
-                    .get_or_init(|| {
-                        std::sync::Arc::new(datafusion_expr::AggregateUDF::from($CREATE))
-                    })
-                    .clone()
+                [< STATIC_ $UDAF >].clone()
             }
         }
     }
diff --git a/datafusion/functions-nested/src/macros.rs b/datafusion/functions-nested/src/macros.rs
index a6e0c2ee62be..3042cee344d8 100644
--- a/datafusion/functions-nested/src/macros.rs
+++ b/datafusion/functions-nested/src/macros.rs
@@ -88,19 +88,14 @@ macro_rules! create_func {
             /// Singleton instance of [`$UDF`], ensures the UDF is only created once
             /// named STATIC_$(UDF). For example `STATIC_ArrayToString`
             #[allow(non_upper_case_globals)]
-            static [< STATIC_ $UDF >]: std::sync::OnceLock<std::sync::Arc<datafusion_expr::ScalarUDF>> =
-                std::sync::OnceLock::new();
-            /// ScalarFunction that returns a [`ScalarUDF`] for [`$UDF`]
-            ///
-            /// [`ScalarUDF`]: datafusion_expr::ScalarUDF
+            static [< STATIC_ $UDF >]: std::sync::LazyLock<std::sync::Arc<datafusion_expr::ScalarUDF>> =
+                std::sync::LazyLock::new(|| {
+                    std::sync::Arc::new(datafusion_expr::ScalarUDF::new_from_impl(<$UDF>::new()))
+                });
+
+            #[doc = concat!("ScalarFunction that returns a [`ScalarUDF`](datafusion_expr::ScalarUDF) for [`", stringify!($UDF), "`]")]
             pub fn $SCALAR_UDF_FN() -> std::sync::Arc<datafusion_expr::ScalarUDF> {
-                [< STATIC_ $UDF >]
-                    .get_or_init(|| {
-                        std::sync::Arc::new(datafusion_expr::ScalarUDF::new_from_impl(
-                            <$UDF>::new(),
-                        ))
-                    })
-                    .clone()
+                [< STATIC_ $UDF >].clone()
             }
         }
     };
diff --git a/datafusion/functions/src/macros.rs b/datafusion/functions/src/macros.rs
index e26c94e1bb79..bb538d88b5b5 100644
--- a/datafusion/functions/src/macros.rs
+++ b/datafusion/functions/src/macros.rs
@@ -72,20 +72,16 @@ macro_rules! export_functions {
 macro_rules! make_udf_function {
     ($UDF:ty, $GNAME:ident, $NAME:ident) => {
         /// Singleton instance of the function
-        static $GNAME: std::sync::OnceLock<std::sync::Arc<datafusion_expr::ScalarUDF>> =
-            std::sync::OnceLock::new();
-
-        /// Return a [`ScalarUDF`] for [`$UDF`]
-        ///
-        /// [`ScalarUDF`]: datafusion_expr::ScalarUDF
+        static $GNAME: std::sync::LazyLock<std::sync::Arc<datafusion_expr::ScalarUDF>> =
+            std::sync::LazyLock::new(|| {
+                std::sync::Arc::new(datafusion_expr::ScalarUDF::new_from_impl(
+                    <$UDF>::new(),
+                ))
+            });
+
+        #[doc = concat!("Return a [`ScalarUDF`](datafusion_expr::ScalarUDF) for [`", stringify!($UDF), "`]")]
         pub fn $NAME() -> std::sync::Arc<datafusion_expr::ScalarUDF> {
-            $GNAME
-                .get_or_init(|| {
-                    std::sync::Arc::new(datafusion_expr::ScalarUDF::new_from_impl(
-                        <$UDF>::new(),
-                    ))
-                })
-                .clone()
+            $GNAME.clone()
        }
    };
}
diff --git a/datafusion/functions/src/regex/regexpreplace.rs b/datafusion/functions/src/regex/regexpreplace.rs
index d820f991be18..aed117b79eb1 100644
--- a/datafusion/functions/src/regex/regexpreplace.rs
+++ b/datafusion/functions/src/regex/regexpreplace.rs
@@ -36,7 +36,7 @@ use regex::Regex;
 use std::any::Any;
 use std::collections::HashMap;
 use std::sync::Arc;
-use std::sync::OnceLock;
+use std::sync::LazyLock;
 #[derive(Debug)]
 pub struct RegexpReplaceFunc {
     signature: Signature,
@@ -118,6 +118,7 @@ impl ScalarUDFImpl for RegexpReplaceFunc {
         }
     }
 }
+
 fn regexp_replace_func(args: &[ColumnarValue]) -> Result<ColumnarValue> {
     match args[0].data_type() {
         DataType::Utf8 => specialize_regexp_replace::<i32>(args),
@@ -127,14 +128,14 @@ fn regexp_replace_func(args: &[ColumnarValue]) -> Result<ColumnarValue> {
         }
     }
 }
-/// replace POSIX capture groups (like \1) with Rust Regex group (like ${1})
+
+/// Replace POSIX capture groups (like \1) with Rust Regex group (like ${1})
 /// used by regexp_replace
 fn regex_replace_posix_groups(replacement: &str) -> String {
-    fn capture_groups_re() -> &'static Regex {
-        static CAPTURE_GROUPS_RE_LOCK: OnceLock<Regex> = OnceLock::new();
-        CAPTURE_GROUPS_RE_LOCK.get_or_init(|| Regex::new(r"(\\)(\d*)").unwrap())
-    }
-    capture_groups_re()
+    static CAPTURE_GROUPS_RE_LOCK: LazyLock<Regex> =
+        LazyLock::new(|| Regex::new(r"(\\)(\d*)").unwrap());
+
+    CAPTURE_GROUPS_RE_LOCK
         .replace_all(replacement, "$${$2}")
         .into_owned()
 }
diff --git a/datafusion/physical-expr/src/utils/guarantee.rs b/datafusion/physical-expr/src/utils/guarantee.rs
index 4385066529e7..e1a104423727 100644
--- a/datafusion/physical-expr/src/utils/guarantee.rs
+++ b/datafusion/physical-expr/src/utils/guarantee.rs
@@ -420,7 +420,7 @@ impl<'a> ColOpLit<'a> {
 
 #[cfg(test)]
 mod test {
-    use std::sync::OnceLock;
+    use std::sync::LazyLock;
 
     use super::*;
     use crate::planner::logical2physical;
@@ -835,8 +835,7 @@ mod test {
     /// Tests that analyzing expr results in the expected guarantees
     fn test_analyze(expr: Expr, expected: Vec<LiteralGuarantee>) {
         println!("Begin analyze of {expr}");
-        let schema = schema();
-        let physical_expr = logical2physical(&expr, &schema);
+        let physical_expr = logical2physical(&expr, &SCHEMA);
 
         let actual = LiteralGuarantee::analyze(&physical_expr);
         assert_eq!(
@@ -869,15 +868,10 @@ mod test {
         LiteralGuarantee::try_new(column, Guarantee::NotIn, literals.iter()).unwrap()
     }
 
-    // Schema for testing
-    fn schema() -> SchemaRef {
-        Arc::clone(SCHEMA.get_or_init(|| {
-            Arc::new(Schema::new(vec![
-                Field::new("a", DataType::Utf8, false),
-                Field::new("b", DataType::Int32, false),
-            ]))
-        }))
-    }
-
-    static SCHEMA: OnceLock<SchemaRef> = OnceLock::new();
+    // Schema for testing
+    static SCHEMA: LazyLock<SchemaRef> = LazyLock::new(|| {
+        Arc::new(Schema::new(vec![
+            Field::new("a", DataType::Utf8, false),
+            Field::new("b", DataType::Int32, false),
+        ]))
+    });
 }
diff --git a/datafusion/sqllogictest/src/engines/datafusion_engine/normalize.rs b/datafusion/sqllogictest/src/engines/datafusion_engine/normalize.rs
index 520b6b53b32d..f0b3b0354346 100644
--- a/datafusion/sqllogictest/src/engines/datafusion_engine/normalize.rs
+++ b/datafusion/sqllogictest/src/engines/datafusion_engine/normalize.rs
@@ -21,7 +21,7 @@ use arrow::{array, array::ArrayRef, datatypes::DataType, record_batch::RecordBatch};
 use datafusion_common::format::DEFAULT_FORMAT_OPTIONS;
 use datafusion_common::DataFusionError;
 use std::path::PathBuf;
-use std::sync::OnceLock;
+use std::sync::LazyLock;
 
 use crate::engines::output::DFColumnType;
 
@@ -130,7 +130,7 @@ fn expand_row(mut row: Vec<String>) -> impl Iterator<Item = Vec<String>> {
 /// ```
 fn normalize_paths(mut row: Vec<String>) -> Vec<String> {
     row.iter_mut().for_each(|s| {
-        let workspace_root: &str = workspace_root().as_ref();
+        let workspace_root: &str = WORKSPACE_ROOT.as_ref();
         if s.contains(workspace_root) {
             *s = s.replace(workspace_root, "WORKSPACE_ROOT");
         }
@@ -138,34 +138,30 @@ fn normalize_paths(mut row: Vec<String>) -> Vec<String> {
     row
 }
 
-/// return the location of the datafusion checkout
-fn workspace_root() -> &'static object_store::path::Path {
-    static WORKSPACE_ROOT_LOCK: OnceLock<object_store::path::Path> = OnceLock::new();
-    WORKSPACE_ROOT_LOCK.get_or_init(|| {
-        // e.g. /Software/datafusion/datafusion/core
-        let dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
+/// The location of the datafusion checkout
+static WORKSPACE_ROOT: LazyLock<object_store::path::Path> = LazyLock::new(|| {
+    // e.g. /Software/datafusion/datafusion/core
+    let dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
 
-        // e.g. /Software/datafusion/datafusion
-        let workspace_root = dir
-            .parent()
-            .expect("Can not find parent of datafusion/core")
-            // e.g. /Software/datafusion
-            .parent()
-            .expect("parent of datafusion")
-            .to_string_lossy();
+    // e.g. /Software/datafusion/datafusion
+    let workspace_root = dir
+        .parent()
+        .expect("Can not find parent of datafusion/core")
+        // e.g. /Software/datafusion
+        .parent()
+        .expect("parent of datafusion")
+        .to_string_lossy();
 
-        let sanitized_workplace_root = if cfg!(windows) {
-            // Object store paths are delimited with `/`, e.g. `/datafusion/datafusion/testing/data/csv/aggregate_test_100.csv`.
-            // The default windows delimiter is `\`, so the workplace path is `datafusion\datafusion`.
-            workspace_root
-                .replace(std::path::MAIN_SEPARATOR, object_store::path::DELIMITER)
-        } else {
-            workspace_root.to_string()
-        };
+    let sanitized_workplace_root = if cfg!(windows) {
+        // Object store paths are delimited with `/`, e.g. `/datafusion/datafusion/testing/data/csv/aggregate_test_100.csv`.
+        // The default windows delimiter is `\`, so the workplace path is `datafusion\datafusion`.
+        workspace_root.replace(std::path::MAIN_SEPARATOR, object_store::path::DELIMITER)
+    } else {
+        workspace_root.to_string()
+    };
 
-        object_store::path::Path::parse(sanitized_workplace_root).unwrap()
-    })
-}
+    object_store::path::Path::parse(sanitized_workplace_root).unwrap()
+});
 
 /// Convert a single batch to a `Vec<Vec<String>>` for comparison
 fn convert_batch(batch: RecordBatch) -> Result<Vec<Vec<String>>> {

From ac6962256c976a1d307bf5ad7b5e6071f74b7f79 Mon Sep 17 00:00:00 2001
From: Alexander Rafferty
Date: Sun, 28 Jul 2024 21:03:58 +1000
Subject: [PATCH 2/2] Fix bug in `external_access_plan.rs` and update
 remaining MSRVs to 1.80.

---
 datafusion-cli/Cargo.toml                             | 2 +-
 datafusion/core/tests/parquet/external_access_plan.rs | 2 +-
 datafusion/proto-common/Cargo.toml                    | 2 +-
 datafusion/proto-common/gen/Cargo.toml                | 2 +-
 datafusion/proto/Cargo.toml                           | 2 +-
 datafusion/proto/gen/Cargo.toml                       | 2 +-
 datafusion/substrait/Cargo.toml                       | 2 +-
 7 files changed, 7 insertions(+), 7 deletions(-)
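
Note for reviewers: the `external_access_plan.rs` change appears to fix a
compile error from PATCH 1/2, where a struct pattern was destructured
straight out of a `LazyLock`. A minimal sketch with illustrative types
(not the real `TestData`):

    use std::sync::LazyLock;

    struct TestData {
        file_name: String,
        file_size: u64,
    }

    static TEST_DATA: LazyLock<TestData> = LazyLock::new(|| TestData {
        file_name: "example.parquet".to_string(),
        file_size: 1024,
    });

    fn main() {
        // let TestData { file_name, file_size } = TEST_DATA;
        //   ^ mismatched types: the pattern `TestData { .. }` cannot match
        //     a `LazyLock<TestData>`; deref coercion does not apply here.
        let TestData { file_name, file_size } = &*TEST_DATA;
        println!("{file_name}: {file_size} bytes");
    }

`&*TEST_DATA` runs the `Deref` impl (initializing the value on first use)
and reborrows it as `&TestData`, so the fields bind by reference; that is
also why the call site still passes `*file_size`.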
diff --git a/datafusion-cli/Cargo.toml b/datafusion-cli/Cargo.toml
index 860dc123fa94..9583492c4113 100644
--- a/datafusion-cli/Cargo.toml
+++ b/datafusion-cli/Cargo.toml
@@ -26,7 +26,7 @@ license = "Apache-2.0"
 homepage = "https://datafusion.apache.org"
 repository = "https://github.com/apache/datafusion"
 # Specify MSRV here as `cargo msrv` doesn't support workspace version
-rust-version = "1.76"
+rust-version = "1.80"
 readme = "README.md"
 
 [dependencies]
diff --git a/datafusion/core/tests/parquet/external_access_plan.rs b/datafusion/core/tests/parquet/external_access_plan.rs
index e6dc02fac556..cce9c66f1e71 100644
--- a/datafusion/core/tests/parquet/external_access_plan.rs
+++ b/datafusion/core/tests/parquet/external_access_plan.rs
@@ -317,7 +317,7 @@ impl TestFull {
             schema,
             file_name,
             file_size,
-        } = TEST_DATA;
+        } = &*TEST_DATA;
 
         let mut partitioned_file = PartitionedFile::new(file_name, *file_size);
 
diff --git a/datafusion/proto-common/Cargo.toml b/datafusion/proto-common/Cargo.toml
index e5d65827cdec..9b2f15a9a710 100644
--- a/datafusion/proto-common/Cargo.toml
+++ b/datafusion/proto-common/Cargo.toml
@@ -26,7 +26,7 @@ homepage = { workspace = true }
 repository = { workspace = true }
 license = { workspace = true }
 authors = { workspace = true }
-rust-version = "1.76"
+rust-version = "1.80"
 
 # Exclude proto files so crates.io consumers don't need protoc
 exclude = ["*.proto"]
diff --git a/datafusion/proto-common/gen/Cargo.toml b/datafusion/proto-common/gen/Cargo.toml
index 54ec0e44694b..bb03208b2b70 100644
--- a/datafusion/proto-common/gen/Cargo.toml
+++ b/datafusion/proto-common/gen/Cargo.toml
@@ -20,7 +20,7 @@ name = "gen-common"
 description = "Code generation for proto"
 version = "0.1.0"
 edition = { workspace = true }
-rust-version = "1.76"
+rust-version = "1.80"
 authors = { workspace = true }
 homepage = { workspace = true }
 repository = { workspace = true }
diff --git a/datafusion/proto/Cargo.toml b/datafusion/proto/Cargo.toml
index 95d9e6700a50..4203bd7a28c0 100644
--- a/datafusion/proto/Cargo.toml
+++ b/datafusion/proto/Cargo.toml
@@ -27,7 +27,7 @@ repository = { workspace = true }
 license = { workspace = true }
 authors = { workspace = true }
 # Specify MSRV here as `cargo msrv` doesn't support workspace version
-rust-version = "1.76"
+rust-version = "1.80"
 # Exclude proto files so crates.io consumers don't need protoc
 exclude = ["*.proto"]
diff --git a/datafusion/proto/gen/Cargo.toml b/datafusion/proto/gen/Cargo.toml
index 401c51c94563..e69282540cb2 100644
--- a/datafusion/proto/gen/Cargo.toml
+++ b/datafusion/proto/gen/Cargo.toml
@@ -20,7 +20,7 @@ name = "gen"
 description = "Code generation for proto"
 version = "0.1.0"
 edition = { workspace = true }
-rust-version = "1.76"
+rust-version = "1.80"
 authors = { workspace = true }
 homepage = { workspace = true }
 repository = { workspace = true }
diff --git a/datafusion/substrait/Cargo.toml b/datafusion/substrait/Cargo.toml
index 9e7ef9632ad3..0647263225c4 100644
--- a/datafusion/substrait/Cargo.toml
+++ b/datafusion/substrait/Cargo.toml
@@ -26,7 +26,7 @@ repository = { workspace = true }
 license = { workspace = true }
 authors = { workspace = true }
 # Specify MSRV here as `cargo msrv` doesn't support workspace version
-rust-version = "1.76"
+rust-version = "1.80"
 
 [lints]
 workspace = true