From 21f76c22e9660b2d6f207df9379068d919fa7e67 Mon Sep 17 00:00:00 2001 From: kf zheng <100595273+kev1n8@users.noreply.github.com> Date: Mon, 12 Aug 2024 21:11:45 +0800 Subject: [PATCH 1/5] add stringview option for ltrim --- datafusion/functions/src/string/ltrim.rs | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/datafusion/functions/src/string/ltrim.rs b/datafusion/functions/src/string/ltrim.rs index 6a9fafdd9299..08d02d8be353 100644 --- a/datafusion/functions/src/string/ltrim.rs +++ b/datafusion/functions/src/string/ltrim.rs @@ -32,7 +32,8 @@ use crate::utils::{make_scalar_function, utf8_to_str_type}; /// Returns the longest string with leading characters removed. If the characters are not specified, whitespace is removed. /// ltrim('zzzytest', 'xyz') = 'test' fn ltrim(args: &[ArrayRef]) -> Result { - general_trim::(args, TrimType::Left, false) + let use_string_view = args[0].data_type() == &DataType::Utf8View; + general_trim::(args, TrimType::Left, use_string_view) } #[derive(Debug)] @@ -51,7 +52,16 @@ impl LtrimFunc { use DataType::*; Self { signature: Signature::one_of( - vec![Exact(vec![Utf8]), Exact(vec![Utf8, Utf8])], + vec![ + // Planner attempts coercion to the target type starting with the most preferred candidate. + // For example, given input `(Utf8View, Utf8)`, it first tries coercing to `(Utf8View, Utf8View)`. + // If that fails, it proceeds to `(Utf8, Utf8)`. + Exact(vec![Utf8View, Utf8View]), + // Exact(vec![Utf8, Utf8View]), + Exact(vec![Utf8, Utf8]), + Exact(vec![Utf8View]), + Exact(vec![Utf8]), + ], Volatility::Immutable, ), } @@ -77,7 +87,7 @@ impl ScalarUDFImpl for LtrimFunc { fn invoke(&self, args: &[ColumnarValue]) -> Result { match args[0].data_type() { - DataType::Utf8 => make_scalar_function( + DataType::Utf8 | DataType::Utf8View => make_scalar_function( ltrim::, vec![Hint::Pad, Hint::AcceptsSingular], )(args), @@ -85,7 +95,10 @@ impl ScalarUDFImpl for LtrimFunc { ltrim::, vec![Hint::Pad, Hint::AcceptsSingular], )(args), - other => exec_err!("Unsupported data type {other:?} for function ltrim"), + other => exec_err!( + "Unsupported data type {other:?} for function ltrim,\ + expected for Utf8, LargeUtf8 or Utf8View." + ), } } } From d98a01fd71bc942751440f34193d963af0e0f2c6 Mon Sep 17 00:00:00 2001 From: kf zheng <100595273+kev1n8@users.noreply.github.com> Date: Mon, 12 Aug 2024 21:11:53 +0800 Subject: [PATCH 2/5] add stringview option for rtrim --- datafusion/functions/src/string/rtrim.rs | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/datafusion/functions/src/string/rtrim.rs b/datafusion/functions/src/string/rtrim.rs index 50b626e3df0e..aeb23a9a2ef2 100644 --- a/datafusion/functions/src/string/rtrim.rs +++ b/datafusion/functions/src/string/rtrim.rs @@ -32,7 +32,8 @@ use crate::utils::{make_scalar_function, utf8_to_str_type}; /// Returns the longest string with trailing characters removed. If the characters are not specified, whitespace is removed. /// rtrim('testxxzx', 'xyz') = 'test' fn rtrim(args: &[ArrayRef]) -> Result { - general_trim::(args, TrimType::Right, false) + let use_string_view = args[0].data_type() == &DataType::Utf8View; + general_trim::(args, TrimType::Right, use_string_view) } #[derive(Debug)] @@ -51,7 +52,16 @@ impl RtrimFunc { use DataType::*; Self { signature: Signature::one_of( - vec![Exact(vec![Utf8]), Exact(vec![Utf8, Utf8])], + vec![ + // Planner attempts coercion to the target type starting with the most preferred candidate. + // For example, given input `(Utf8View, Utf8)`, it first tries coercing to `(Utf8View, Utf8View)`. + // If that fails, it proceeds to `(Utf8, Utf8)`. + Exact(vec![Utf8View, Utf8View]), + // Exact(vec![Utf8, Utf8View]), + Exact(vec![Utf8, Utf8]), + Exact(vec![Utf8View]), + Exact(vec![Utf8]), + ], Volatility::Immutable, ), } @@ -77,7 +87,7 @@ impl ScalarUDFImpl for RtrimFunc { fn invoke(&self, args: &[ColumnarValue]) -> Result { match args[0].data_type() { - DataType::Utf8 => make_scalar_function( + DataType::Utf8 | DataType::Utf8View => make_scalar_function( rtrim::, vec![Hint::Pad, Hint::AcceptsSingular], )(args), @@ -85,7 +95,10 @@ impl ScalarUDFImpl for RtrimFunc { rtrim::, vec![Hint::Pad, Hint::AcceptsSingular], )(args), - other => exec_err!("Unsupported data type {other:?} for function rtrim"), + other => exec_err!( + "Unsupported data type {other:?} for function rtrim,\ + expected for Utf8, LargeUtf8 or Utf8View." + ), } } } From 069bce94198c3f953990605a0123dc9a0d2dbc38 Mon Sep 17 00:00:00 2001 From: kf zheng <100595273+kev1n8@users.noreply.github.com> Date: Mon, 12 Aug 2024 21:12:35 +0800 Subject: [PATCH 3/5] add some tests to ensure no casts for ltrim & rtrim when using stringview --- .../sqllogictest/test_files/string_view.slt | 126 +++++++++++++----- 1 file changed, 91 insertions(+), 35 deletions(-) diff --git a/datafusion/sqllogictest/test_files/string_view.slt b/datafusion/sqllogictest/test_files/string_view.slt index fcd71b7f7e94..9685fea89cfa 100644 --- a/datafusion/sqllogictest/test_files/string_view.slt +++ b/datafusion/sqllogictest/test_files/string_view.slt @@ -607,6 +607,97 @@ Xiangpeng Xiangpeng Xiangpeng NULL Raphael Raphael Raphael NULL NULL NULL NULL NULL +## Ensure no casts for LTRIM +# Test LTRIM with Utf8View input +query TT +EXPLAIN SELECT + LTRIM(column1_utf8view) AS l +FROM test; +---- +logical_plan +01)Projection: ltrim(test.column1_utf8view) AS l +02)--TableScan: test projection=[column1_utf8view] + +# Test LTRIM with Utf8View input and Utf8View pattern +query TT +EXPLAIN SELECT + LTRIM(column1_utf8view, 'foo') AS l +FROM test; +---- +logical_plan +01)Projection: ltrim(test.column1_utf8view, Utf8View("foo")) AS l +02)--TableScan: test projection=[column1_utf8view] + +# Test LTRIM with Utf8View bytes longer than 12 +query TT +EXPLAIN SELECT + LTRIM(column1_utf8view, 'this is longer than 12') AS l +FROM test; +---- +logical_plan +01)Projection: ltrim(test.column1_utf8view, Utf8View("this is longer than 12")) AS l +02)--TableScan: test projection=[column1_utf8view] + +# Test LTRIM outputs +query TTTT +SELECT + LTRIM(column1_utf8view, 'foo') AS l1, + LTRIM(column1_utf8view, column2_utf8view) AS l2, + LTRIM(column1_utf8view) AS l3, + LTRIM(column1_utf8view, NULL) AS l4 +FROM test; +---- +Andrew Andrew Andrew NULL +Xiangpeng (empty) Xiangpeng NULL +Raphael aphael Raphael NULL +NULL NULL NULL NULL + +## ensure no casts for RTRIM +# Test RTRIM with Utf8View input +query TT +EXPLAIN SELECT + RTRIM(column1_utf8view) AS l +FROM test; +---- +logical_plan +01)Projection: rtrim(test.column1_utf8view) AS l +02)--TableScan: test projection=[column1_utf8view] + +# Test RTRIM with Utf8View input and Utf8View pattern +query TT +EXPLAIN SELECT + RTRIM(column1_utf8view, 'foo') AS l +FROM test; +---- +logical_plan +01)Projection: rtrim(test.column1_utf8view, Utf8View("foo")) AS l +02)--TableScan: test projection=[column1_utf8view] + +# Test RTRIM with Utf8View bytes longer than 12 +query TT +EXPLAIN SELECT + RTRIM(column1_utf8view, 'this is longer than 12') AS l +FROM test; +---- +logical_plan +01)Projection: rtrim(test.column1_utf8view, Utf8View("this is longer than 12")) AS l +02)--TableScan: test projection=[column1_utf8view] + +# Test RTRIM outputs +query TTTT +SELECT + RTRIM(column1_utf8view, 'foo') AS l1, + RTRIM(column1_utf8view, column2_utf8view) AS l2, + RTRIM(column1_utf8view) AS l3, + RTRIM(column1_utf8view, NULL) AS l4 +FROM test; +---- +Andrew Andrew Andrew NULL +Xiangpeng (empty) Xiangpeng NULL +Raphael Raphael Raphael NULL +NULL NULL NULL NULL + + ## Ensure no casts for CHARACTER_LENGTH query TT EXPLAIN SELECT @@ -685,16 +776,6 @@ logical_plan 01)Projection: lower(CAST(test.column1_utf8view AS Utf8)) AS c1 02)--TableScan: test projection=[column1_utf8view] -## Ensure no casts for LTRIM -## TODO https://github.com/apache/datafusion/issues/11856 -query TT -EXPLAIN SELECT - LTRIM(column1_utf8view) as c1 -FROM test; ----- -logical_plan -01)Projection: ltrim(CAST(test.column1_utf8view AS Utf8)) AS c1 -02)--TableScan: test projection=[column1_utf8view] ## Ensure no casts for LPAD ## TODO https://github.com/apache/datafusion/issues/11857 @@ -795,18 +876,6 @@ logical_plan 01)Projection: reverse(CAST(test.column1_utf8view AS Utf8)) AS c1 02)--TableScan: test projection=[column1_utf8view] -## Ensure no casts for RTRIM -## TODO file ticket -query TT -EXPLAIN SELECT - RTRIM(column1_utf8view) as c1, - RTRIM(column1_utf8view, 'foo') as c2 -FROM test; ----- -logical_plan -01)Projection: rtrim(__common_expr_1) AS c1, rtrim(__common_expr_1, Utf8("foo")) AS c2 -02)--Projection: CAST(test.column1_utf8view AS Utf8) AS __common_expr_1 -03)----TableScan: test projection=[column1_utf8view] ## Ensure no casts for RIGHT ## TODO file ticket @@ -833,19 +902,6 @@ logical_plan 03)----TableScan: test projection=[column1_utf8view, column2_utf8view] -## Ensure no casts for RTRIM -## TODO file ticket -query TT -EXPLAIN SELECT - RTRIM(column1_utf8view) as c, - RTRIM(column1_utf8view, column2_utf8view) as c1 -FROM test; ----- -logical_plan -01)Projection: rtrim(__common_expr_1) AS c, rtrim(__common_expr_1, CAST(test.column2_utf8view AS Utf8)) AS c1 -02)--Projection: CAST(test.column1_utf8view AS Utf8) AS __common_expr_1, test.column2_utf8view -03)----TableScan: test projection=[column1_utf8view, column2_utf8view] - ## Ensure no casts for SPLIT_PART ## TODO file ticket query TT From 08dc02952105a56b73715e4c559114d9246d960b Mon Sep 17 00:00:00 2001 From: kf zheng <100595273+kev1n8@users.noreply.github.com> Date: Tue, 13 Aug 2024 10:59:18 +0800 Subject: [PATCH 4/5] fix typo and remove useless comments --- datafusion/functions/src/string/btrim.rs | 3 +-- datafusion/functions/src/string/ltrim.rs | 3 +-- datafusion/functions/src/string/rtrim.rs | 3 +-- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/datafusion/functions/src/string/btrim.rs b/datafusion/functions/src/string/btrim.rs index 86470dd7a646..371a11c82c54 100644 --- a/datafusion/functions/src/string/btrim.rs +++ b/datafusion/functions/src/string/btrim.rs @@ -57,7 +57,6 @@ impl BTrimFunc { // For example, given input `(Utf8View, Utf8)`, it first tries coercing to `(Utf8View, Utf8View)`. // If that fails, it proceeds to `(Utf8, Utf8)`. Exact(vec![Utf8View, Utf8View]), - // Exact(vec![Utf8, Utf8View]), Exact(vec![Utf8, Utf8]), Exact(vec![Utf8View]), Exact(vec![Utf8]), @@ -98,7 +97,7 @@ impl ScalarUDFImpl for BTrimFunc { )(args), other => exec_err!( "Unsupported data type {other:?} for function btrim,\ - expected for Utf8, LargeUtf8 or Utf8View." + expected Utf8, LargeUtf8 or Utf8View." ), } } diff --git a/datafusion/functions/src/string/ltrim.rs b/datafusion/functions/src/string/ltrim.rs index 08d02d8be353..b7b27afcee1f 100644 --- a/datafusion/functions/src/string/ltrim.rs +++ b/datafusion/functions/src/string/ltrim.rs @@ -57,7 +57,6 @@ impl LtrimFunc { // For example, given input `(Utf8View, Utf8)`, it first tries coercing to `(Utf8View, Utf8View)`. // If that fails, it proceeds to `(Utf8, Utf8)`. Exact(vec![Utf8View, Utf8View]), - // Exact(vec![Utf8, Utf8View]), Exact(vec![Utf8, Utf8]), Exact(vec![Utf8View]), Exact(vec![Utf8]), @@ -97,7 +96,7 @@ impl ScalarUDFImpl for LtrimFunc { )(args), other => exec_err!( "Unsupported data type {other:?} for function ltrim,\ - expected for Utf8, LargeUtf8 or Utf8View." + expected Utf8, LargeUtf8 or Utf8View." ), } } diff --git a/datafusion/functions/src/string/rtrim.rs b/datafusion/functions/src/string/rtrim.rs index aeb23a9a2ef2..ec53f3ed7430 100644 --- a/datafusion/functions/src/string/rtrim.rs +++ b/datafusion/functions/src/string/rtrim.rs @@ -57,7 +57,6 @@ impl RtrimFunc { // For example, given input `(Utf8View, Utf8)`, it first tries coercing to `(Utf8View, Utf8View)`. // If that fails, it proceeds to `(Utf8, Utf8)`. Exact(vec![Utf8View, Utf8View]), - // Exact(vec![Utf8, Utf8View]), Exact(vec![Utf8, Utf8]), Exact(vec![Utf8View]), Exact(vec![Utf8]), @@ -97,7 +96,7 @@ impl ScalarUDFImpl for RtrimFunc { )(args), other => exec_err!( "Unsupported data type {other:?} for function rtrim,\ - expected for Utf8, LargeUtf8 or Utf8View." + expected Utf8, LargeUtf8 or Utf8View." ), } } From ae90c1957eb25b3bc56c07fe71d56a9f4ff5f7e3 Mon Sep 17 00:00:00 2001 From: kf zheng <100595273+kev1n8@users.noreply.github.com> Date: Tue, 13 Aug 2024 11:06:47 +0800 Subject: [PATCH 5/5] add tests covering ltrim and rtrim functioning --- .../sqllogictest/test_files/string_view.slt | 26 ++++++++++--------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/datafusion/sqllogictest/test_files/string_view.slt b/datafusion/sqllogictest/test_files/string_view.slt index 9685fea89cfa..648994c84658 100644 --- a/datafusion/sqllogictest/test_files/string_view.slt +++ b/datafusion/sqllogictest/test_files/string_view.slt @@ -639,18 +639,19 @@ logical_plan 02)--TableScan: test projection=[column1_utf8view] # Test LTRIM outputs -query TTTT +query TTTTT SELECT LTRIM(column1_utf8view, 'foo') AS l1, LTRIM(column1_utf8view, column2_utf8view) AS l2, LTRIM(column1_utf8view) AS l3, - LTRIM(column1_utf8view, NULL) AS l4 + LTRIM(column1_utf8view, NULL) AS l4, + LTRIM(column1_utf8view, 'Xiang') AS l5 FROM test; ---- -Andrew Andrew Andrew NULL -Xiangpeng (empty) Xiangpeng NULL -Raphael aphael Raphael NULL -NULL NULL NULL NULL +Andrew Andrew Andrew NULL Andrew +Xiangpeng (empty) Xiangpeng NULL peng +Raphael aphael Raphael NULL Raphael +NULL NULL NULL NULL NULL ## ensure no casts for RTRIM # Test RTRIM with Utf8View input @@ -684,18 +685,19 @@ logical_plan 02)--TableScan: test projection=[column1_utf8view] # Test RTRIM outputs -query TTTT +query TTTTT SELECT RTRIM(column1_utf8view, 'foo') AS l1, RTRIM(column1_utf8view, column2_utf8view) AS l2, RTRIM(column1_utf8view) AS l3, - RTRIM(column1_utf8view, NULL) AS l4 + RTRIM(column1_utf8view, NULL) AS l4, + RTRIM(column1_utf8view, 'peng') As l5 FROM test; ---- -Andrew Andrew Andrew NULL -Xiangpeng (empty) Xiangpeng NULL -Raphael Raphael Raphael NULL -NULL NULL NULL NULL +Andrew Andrew Andrew NULL Andrew +Xiangpeng (empty) Xiangpeng NULL Xia +Raphael Raphael Raphael NULL Raphael +NULL NULL NULL NULL NULL ## Ensure no casts for CHARACTER_LENGTH