From 723ceb7ace71ba79c5bcd4dc51e505dccfc6e522 Mon Sep 17 00:00:00 2001 From: Devan Date: Wed, 28 Aug 2024 12:31:59 -0500 Subject: [PATCH 01/18] wip --- datafusion/functions/src/string/concat.rs | 15 +++++++++---- .../sqllogictest/test_files/string_view.slt | 22 +++++++++++++++++++ 2 files changed, 33 insertions(+), 4 deletions(-) diff --git a/datafusion/functions/src/string/concat.rs b/datafusion/functions/src/string/concat.rs index 6d15e2206721..8bdc5bb9ab76 100644 --- a/datafusion/functions/src/string/concat.rs +++ b/datafusion/functions/src/string/concat.rs @@ -22,7 +22,7 @@ use arrow::datatypes::DataType; use arrow::datatypes::DataType::Utf8; use datafusion_common::cast::as_string_array; -use datafusion_common::{internal_err, Result, ScalarValue}; +use datafusion_common::{exec_err, internal_err, Result, ScalarValue}; use datafusion_expr::expr::ScalarFunction; use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyInfo}; use datafusion_expr::{lit, ColumnarValue, Expr, Volatility}; @@ -46,7 +46,7 @@ impl ConcatFunc { pub fn new() -> Self { use DataType::*; Self { - signature: Signature::variadic(vec![Utf8], Volatility::Immutable), + signature: Signature::variadic(vec![Utf8, Utf8View], Volatility::Immutable), } } } @@ -64,8 +64,15 @@ impl ScalarUDFImpl for ConcatFunc { &self.signature } - fn return_type(&self, _arg_types: &[DataType]) -> Result { - Ok(Utf8) + fn return_type(&self, arg_types: &[DataType]) -> Result { + use DataType::*; + Ok(match &arg_types[0] { + Utf8 => Utf8, + Utf8View => Utf8View, + other => { + exec_err!("CONCAT function cannot use {other} as a return type"); + } + }) } /// Concatenates the text representations of all the arguments. NULL arguments are ignored. diff --git a/datafusion/sqllogictest/test_files/string_view.slt b/datafusion/sqllogictest/test_files/string_view.slt index 83c75b8df38c..7c6cfdf86d0b 100644 --- a/datafusion/sqllogictest/test_files/string_view.slt +++ b/datafusion/sqllogictest/test_files/string_view.slt @@ -863,6 +863,27 @@ XIANGPENG RAPHAEL NULL +## Should run CONCAT() successfully +query T +SELECT + concat(column1_utf8view, column2_utf8view) as c +FROM test; +---- +AndrewX +XiangpengXiangpeng +RaphaelR +R + +## ensure no cast CONCAT to utf8 from utf8view +query TT +EXPLAIN SELECT + concat(column1_utf8view, column2_utf8view) as c +FROM test; +---- +logical_plan +01)Projection: concat(test.column1_utf8view, test.column2_utf8view) AS c +02)--TableScan: test projection=[column1_utf8view, column2_utf8view] + ## Ensure no casts for LPAD query TT EXPLAIN SELECT @@ -1307,3 +1328,4 @@ select column2|| ' ' ||column3 from temp; ---- rust fast datafusion cool + From f7abdd523de087575d59bab5295b5c4f3b3f2ab2 Mon Sep 17 00:00:00 2001 From: Devan Date: Wed, 28 Aug 2024 21:00:07 -0500 Subject: [PATCH 02/18] feat: Update the CONCAT scalar function to support Utf8View --- datafusion/functions/src/string/common.rs | 17 +++- datafusion/functions/src/string/concat.rs | 81 +++++++++++++++---- .../sqllogictest/test_files/string_view.slt | 12 +-- 3 files changed, 82 insertions(+), 28 deletions(-) diff --git a/datafusion/functions/src/string/common.rs b/datafusion/functions/src/string/common.rs index a5dc22b4d9e4..8779d99cd491 100644 --- a/datafusion/functions/src/string/common.rs +++ b/datafusion/functions/src/string/common.rs @@ -255,6 +255,8 @@ pub(crate) enum ColumnarValueRef<'a> { Scalar(&'a [u8]), NullableArray(&'a StringArray), NonNullableArray(&'a StringArray), + NullableStringViewArray(&'a StringViewArray), + NonNullableStringViewArray(&'a StringViewArray) } impl<'a> ColumnarValueRef<'a> { @@ -262,15 +264,18 @@ impl<'a> ColumnarValueRef<'a> { pub fn is_valid(&self, i: usize) -> bool { match &self { Self::Scalar(_) | Self::NonNullableArray(_) => true, + Self::NonNullableStringViewArray(_) => true, Self::NullableArray(array) => array.is_valid(i), + Self::NullableStringViewArray(array) => array.is_valid(i), } } #[inline] pub fn nulls(&self) -> Option { match &self { - Self::Scalar(_) | Self::NonNullableArray(_) => None, + Self::Scalar(_) | Self::NonNullableArray(_) | Self::NonNullableStringViewArray(_) => None, Self::NullableArray(array) => array.nulls().cloned(), + Self::NullableStringViewArray(array) => array.nulls().cloned(), } } } @@ -389,9 +394,19 @@ impl StringArrayBuilder { .extend_from_slice(array.value(i).as_bytes()); } } + ColumnarValueRef::NullableStringViewArray(array) => { + if !CHECK_VALID || array.is_valid(i) { + self.value_buffer + .extend_from_slice(array.value(i).as_bytes()); + } + }, ColumnarValueRef::NonNullableArray(array) => { self.value_buffer .extend_from_slice(array.value(i).as_bytes()); + }, + ColumnarValueRef::NonNullableStringViewArray(array) => { + self.value_buffer + .extend_from_slice(array.value(i).as_bytes()); } } } diff --git a/datafusion/functions/src/string/concat.rs b/datafusion/functions/src/string/concat.rs index 8bdc5bb9ab76..2e7b48d6161f 100644 --- a/datafusion/functions/src/string/concat.rs +++ b/datafusion/functions/src/string/concat.rs @@ -17,12 +17,11 @@ use std::any::Any; use std::sync::Arc; - +use arrow::array::{Array, StringViewArray}; use arrow::datatypes::DataType; -use arrow::datatypes::DataType::Utf8; -use datafusion_common::cast::as_string_array; -use datafusion_common::{exec_err, internal_err, Result, ScalarValue}; +use datafusion_common::cast::{as_string_array, as_string_view_array}; +use datafusion_common::{internal_err, plan_err, Result, ScalarValue}; use datafusion_expr::expr::ScalarFunction; use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyInfo}; use datafusion_expr::{lit, ColumnarValue, Expr, Volatility}; @@ -70,7 +69,7 @@ impl ScalarUDFImpl for ConcatFunc { Utf8 => Utf8, Utf8View => Utf8View, other => { - exec_err!("CONCAT function cannot use {other} as a return type"); + return plan_err!("CONCAT function cannot use {other} as a return type"); } }) } @@ -78,6 +77,7 @@ impl ScalarUDFImpl for ConcatFunc { /// Concatenates the text representations of all the arguments. NULL arguments are ignored. /// concat('abcde', 2, NULL, 22) = 'abcde222' fn invoke(&self, args: &[ColumnarValue]) -> Result { + let args_datatype = args[0].data_type(); let array_len = args .iter() .filter_map(|x| match x { @@ -94,7 +94,18 @@ impl ScalarUDFImpl for ConcatFunc { result.push_str(v); } } - return Ok(ColumnarValue::Scalar(ScalarValue::Utf8(Some(result)))); + + return match args_datatype { + DataType::Utf8View => { + Ok(ColumnarValue::Scalar(ScalarValue::Utf8View(Some(result)))) + }, + DataType::Utf8 => { + Ok(ColumnarValue::Scalar(ScalarValue::Utf8(Some(result)))) + }, + other => { + plan_err!("Concat function does not support datatype of {other}") + } + } } // Array @@ -110,15 +121,41 @@ impl ScalarUDFImpl for ConcatFunc { columns.push(ColumnarValueRef::Scalar(s.as_bytes())); } } + ColumnarValue::Scalar(ScalarValue::Utf8View(maybe_value)) => { + if let Some(s) = maybe_value { + data_size += s.len() * len; + columns.push(ColumnarValueRef::Scalar(s.as_bytes())); + } + }, ColumnarValue::Array(array) => { - let string_array = as_string_array(array)?; - data_size += string_array.values().len(); - let column = if array.is_nullable() { - ColumnarValueRef::NullableArray(string_array) - } else { - ColumnarValueRef::NonNullableArray(string_array) + match array.data_type() { + DataType::Utf8 | DataType::LargeUtf8 => { + let string_array = as_string_array(array)?; + + data_size += string_array.values().len(); + let column = if array.is_nullable() { + ColumnarValueRef::NullableArray(string_array) + } else { + ColumnarValueRef::NonNullableArray(string_array) + }; + columns.push(column); + }, + DataType::Utf8View => { + let string_array = as_string_view_array(array)?; + + data_size += string_array.len(); + let column = if array.is_nullable() { + ColumnarValueRef::NullableStringViewArray(string_array) + } else { + ColumnarValueRef::NonNullableStringViewArray(string_array) + }; + columns.push(column); + }, + other => { + return plan_err!("Input was {other} which is not a supported datatype for concat function") + } }; - columns.push(column); + } _ => unreachable!(), } @@ -131,7 +168,18 @@ impl ScalarUDFImpl for ConcatFunc { .for_each(|column| builder.write::(column, i)); builder.append_offset(); } - Ok(ColumnarValue::Array(Arc::new(builder.finish(None)))) + let string_array = builder.finish(None); + + match args_datatype { + DataType::Utf8 | DataType::LargeUtf8 => { + Ok(ColumnarValue::Array(Arc::new(string_array))) + }, + DataType::Utf8View => { + Ok(ColumnarValue::Array(Arc::new(StringViewArray::from_iter(string_array.into_iter())))) + }, + _ => unreachable!() + } + } /// Simplify the `concat` function by @@ -158,11 +206,11 @@ pub fn simplify_concat(args: Vec) -> Result { for arg in args.clone() { match arg { // filter out `null` args - Expr::Literal(ScalarValue::Utf8(None) | ScalarValue::LargeUtf8(None)) => {} + Expr::Literal(ScalarValue::Utf8(None) | ScalarValue::LargeUtf8(None) | ScalarValue::Utf8View(None)) => {} // All literals have been converted to Utf8 or LargeUtf8 in type_coercion. // Concatenate it with the `contiguous_scalar`. Expr::Literal( - ScalarValue::Utf8(Some(v)) | ScalarValue::LargeUtf8(Some(v)), + ScalarValue::Utf8(Some(v)) | ScalarValue::LargeUtf8(Some(v)) | ScalarValue::Utf8View(Some(v)), ) => contiguous_scalar += &v, Expr::Literal(x) => { return internal_err!( @@ -201,6 +249,7 @@ pub fn simplify_concat(args: Vec) -> Result { #[cfg(test)] mod tests { use super::*; + use DataType::*; use crate::utils::test::test_function; use arrow::array::Array; use arrow::array::{ArrayRef, StringArray}; diff --git a/datafusion/sqllogictest/test_files/string_view.slt b/datafusion/sqllogictest/test_files/string_view.slt index 7c6cfdf86d0b..177e87369f44 100644 --- a/datafusion/sqllogictest/test_files/string_view.slt +++ b/datafusion/sqllogictest/test_files/string_view.slt @@ -776,7 +776,7 @@ EXPLAIN SELECT FROM test; ---- logical_plan -01)Projection: concat(CAST(test.column1_utf8view AS Utf8), CAST(test.column2_utf8view AS Utf8)) AS c +01)Projection: concat(test.column1_utf8view, test.column2_utf8view) AS c 02)--TableScan: test projection=[column1_utf8view, column2_utf8view] ## Ensure no casts for CONCAT_WS @@ -874,16 +874,6 @@ XiangpengXiangpeng RaphaelR R -## ensure no cast CONCAT to utf8 from utf8view -query TT -EXPLAIN SELECT - concat(column1_utf8view, column2_utf8view) as c -FROM test; ----- -logical_plan -01)Projection: concat(test.column1_utf8view, test.column2_utf8view) AS c -02)--TableScan: test projection=[column1_utf8view, column2_utf8view] - ## Ensure no casts for LPAD query TT EXPLAIN SELECT From 503d5b9d96df22f2ab489baa798a5cef7d2a871c Mon Sep 17 00:00:00 2001 From: Devan Date: Wed, 28 Aug 2024 21:01:43 -0500 Subject: [PATCH 03/18] fmt --- datafusion/functions/src/string/common.rs | 18 +++++++++------- datafusion/functions/src/string/concat.rs | 26 +++++++++++------------ 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/datafusion/functions/src/string/common.rs b/datafusion/functions/src/string/common.rs index 8779d99cd491..c2b8895f41af 100644 --- a/datafusion/functions/src/string/common.rs +++ b/datafusion/functions/src/string/common.rs @@ -256,7 +256,7 @@ pub(crate) enum ColumnarValueRef<'a> { NullableArray(&'a StringArray), NonNullableArray(&'a StringArray), NullableStringViewArray(&'a StringViewArray), - NonNullableStringViewArray(&'a StringViewArray) + NonNullableStringViewArray(&'a StringViewArray), } impl<'a> ColumnarValueRef<'a> { @@ -273,7 +273,9 @@ impl<'a> ColumnarValueRef<'a> { #[inline] pub fn nulls(&self) -> Option { match &self { - Self::Scalar(_) | Self::NonNullableArray(_) | Self::NonNullableStringViewArray(_) => None, + Self::Scalar(_) + | Self::NonNullableArray(_) + | Self::NonNullableStringViewArray(_) => None, Self::NullableArray(array) => array.nulls().cloned(), Self::NullableStringViewArray(array) => array.nulls().cloned(), } @@ -395,15 +397,15 @@ impl StringArrayBuilder { } } ColumnarValueRef::NullableStringViewArray(array) => { - if !CHECK_VALID || array.is_valid(i) { - self.value_buffer - .extend_from_slice(array.value(i).as_bytes()); - } - }, + if !CHECK_VALID || array.is_valid(i) { + self.value_buffer + .extend_from_slice(array.value(i).as_bytes()); + } + } ColumnarValueRef::NonNullableArray(array) => { self.value_buffer .extend_from_slice(array.value(i).as_bytes()); - }, + } ColumnarValueRef::NonNullableStringViewArray(array) => { self.value_buffer .extend_from_slice(array.value(i).as_bytes()); diff --git a/datafusion/functions/src/string/concat.rs b/datafusion/functions/src/string/concat.rs index 2e7b48d6161f..4199a217920a 100644 --- a/datafusion/functions/src/string/concat.rs +++ b/datafusion/functions/src/string/concat.rs @@ -15,10 +15,10 @@ // specific language governing permissions and limitations // under the License. -use std::any::Any; -use std::sync::Arc; use arrow::array::{Array, StringViewArray}; use arrow::datatypes::DataType; +use std::any::Any; +use std::sync::Arc; use datafusion_common::cast::{as_string_array, as_string_view_array}; use datafusion_common::{internal_err, plan_err, Result, ScalarValue}; @@ -98,14 +98,14 @@ impl ScalarUDFImpl for ConcatFunc { return match args_datatype { DataType::Utf8View => { Ok(ColumnarValue::Scalar(ScalarValue::Utf8View(Some(result)))) - }, + } DataType::Utf8 => { Ok(ColumnarValue::Scalar(ScalarValue::Utf8(Some(result)))) - }, + } other => { plan_err!("Concat function does not support datatype of {other}") } - } + }; } // Array @@ -126,7 +126,7 @@ impl ScalarUDFImpl for ConcatFunc { data_size += s.len() * len; columns.push(ColumnarValueRef::Scalar(s.as_bytes())); } - }, + } ColumnarValue::Array(array) => { match array.data_type() { DataType::Utf8 | DataType::LargeUtf8 => { @@ -155,7 +155,6 @@ impl ScalarUDFImpl for ConcatFunc { return plan_err!("Input was {other} which is not a supported datatype for concat function") } }; - } _ => unreachable!(), } @@ -173,13 +172,12 @@ impl ScalarUDFImpl for ConcatFunc { match args_datatype { DataType::Utf8 | DataType::LargeUtf8 => { Ok(ColumnarValue::Array(Arc::new(string_array))) - }, - DataType::Utf8View => { - Ok(ColumnarValue::Array(Arc::new(StringViewArray::from_iter(string_array.into_iter())))) - }, - _ => unreachable!() + } + DataType::Utf8View => Ok(ColumnarValue::Array(Arc::new( + StringViewArray::from_iter(string_array.into_iter()), + ))), + _ => unreachable!(), } - } /// Simplify the `concat` function by @@ -249,10 +247,10 @@ pub fn simplify_concat(args: Vec) -> Result { #[cfg(test)] mod tests { use super::*; - use DataType::*; use crate::utils::test::test_function; use arrow::array::Array; use arrow::array::{ArrayRef, StringArray}; + use DataType::*; #[test] fn test_functions() -> Result<()> { From b30330dd4268067a51c91637bd079359e9a22e06 Mon Sep 17 00:00:00 2001 From: Devan Date: Wed, 28 Aug 2024 21:14:37 -0500 Subject: [PATCH 04/18] fmt and add default return type for concat --- datafusion/functions/src/string/concat.rs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/datafusion/functions/src/string/concat.rs b/datafusion/functions/src/string/concat.rs index 4199a217920a..3d40a3f37d7e 100644 --- a/datafusion/functions/src/string/concat.rs +++ b/datafusion/functions/src/string/concat.rs @@ -66,11 +66,8 @@ impl ScalarUDFImpl for ConcatFunc { fn return_type(&self, arg_types: &[DataType]) -> Result { use DataType::*; Ok(match &arg_types[0] { - Utf8 => Utf8, Utf8View => Utf8View, - other => { - return plan_err!("CONCAT function cannot use {other} as a return type"); - } + _ => Utf8, }) } From 76d6b5f302e7b7bfd6e14bfdc2b8840db327ee72 Mon Sep 17 00:00:00 2001 From: Devan Date: Wed, 28 Aug 2024 21:59:45 -0500 Subject: [PATCH 05/18] fix clippy lint Signed-off-by: Devan --- datafusion/functions/src/string/concat.rs | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/datafusion/functions/src/string/concat.rs b/datafusion/functions/src/string/concat.rs index 3d40a3f37d7e..966439a74676 100644 --- a/datafusion/functions/src/string/concat.rs +++ b/datafusion/functions/src/string/concat.rs @@ -170,9 +170,12 @@ impl ScalarUDFImpl for ConcatFunc { DataType::Utf8 | DataType::LargeUtf8 => { Ok(ColumnarValue::Array(Arc::new(string_array))) } - DataType::Utf8View => Ok(ColumnarValue::Array(Arc::new( - StringViewArray::from_iter(string_array.into_iter()), - ))), + DataType::Utf8View => { + let string_array_iter = string_array.into_iter(); + Ok(ColumnarValue::Array(Arc::new( + StringViewArray::from_iter(string_array_iter), + ))) + }, _ => unreachable!(), } } From 9798ed35a3b52f125c409ea9b0ae5d8e8c677d8c Mon Sep 17 00:00:00 2001 From: Devan Date: Wed, 28 Aug 2024 22:01:10 -0500 Subject: [PATCH 06/18] fmt Signed-off-by: Devan --- datafusion/functions/src/string/concat.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/datafusion/functions/src/string/concat.rs b/datafusion/functions/src/string/concat.rs index 966439a74676..da53b07c2135 100644 --- a/datafusion/functions/src/string/concat.rs +++ b/datafusion/functions/src/string/concat.rs @@ -172,10 +172,10 @@ impl ScalarUDFImpl for ConcatFunc { } DataType::Utf8View => { let string_array_iter = string_array.into_iter(); - Ok(ColumnarValue::Array(Arc::new( - StringViewArray::from_iter(string_array_iter), - ))) - }, + Ok(ColumnarValue::Array(Arc::new(StringViewArray::from_iter( + string_array_iter, + )))) + } _ => unreachable!(), } } From 91d04ff39d3d06db45257984f1b08a6499848e06 Mon Sep 17 00:00:00 2001 From: Devan Date: Thu, 29 Aug 2024 09:05:13 -0500 Subject: [PATCH 07/18] add more tests for sqllogic Signed-off-by: Devan --- .../sqllogictest/test_files/string_view.slt | 24 ++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/datafusion/sqllogictest/test_files/string_view.slt b/datafusion/sqllogictest/test_files/string_view.slt index 177e87369f44..5fcd445b843e 100644 --- a/datafusion/sqllogictest/test_files/string_view.slt +++ b/datafusion/sqllogictest/test_files/string_view.slt @@ -863,7 +863,7 @@ XIANGPENG RAPHAEL NULL -## Should run CONCAT() successfully +## Should run CONCAT successfully query T SELECT concat(column1_utf8view, column2_utf8view) as c @@ -874,6 +874,28 @@ XiangpengXiangpeng RaphaelR R +## Should run CONCAT successfully with utf8 and utf8view +query T +SELECT + concat(column1_utf8view, column2_utf8) as c +FROM test; +---- +AndrewX +XiangpengXiangpeng +RaphaelR +R + +## Should run CONCAT successfully with utf8 utf8view and largeutf8 +query T +SELECT + concat(column1_utf8view, column2_utf8, column2_large_utf8) as c +FROM test; +---- +AndrewXX +XiangpengXiangpengXiangpeng +RaphaelRR +RR + ## Ensure no casts for LPAD query TT EXPLAIN SELECT From 6d289275882374bd324ca5604f1604f65e3b9191 Mon Sep 17 00:00:00 2001 From: Devan Date: Thu, 29 Aug 2024 16:01:05 -0500 Subject: [PATCH 08/18] make sure no casting with LargeUtf8 --- datafusion/functions/src/string/concat.rs | 9 ++++++++- datafusion/sqllogictest/test_files/string_view.slt | 13 +++++++++++-- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/datafusion/functions/src/string/concat.rs b/datafusion/functions/src/string/concat.rs index da53b07c2135..6bd56b7673df 100644 --- a/datafusion/functions/src/string/concat.rs +++ b/datafusion/functions/src/string/concat.rs @@ -45,7 +45,10 @@ impl ConcatFunc { pub fn new() -> Self { use DataType::*; Self { - signature: Signature::variadic(vec![Utf8, Utf8View], Volatility::Immutable), + signature: Signature::variadic( + vec![Utf8, Utf8View, LargeUtf8], + Volatility::Immutable, + ), } } } @@ -67,6 +70,7 @@ impl ScalarUDFImpl for ConcatFunc { use DataType::*; Ok(match &arg_types[0] { Utf8View => Utf8View, + LargeUtf8 => LargeUtf8, _ => Utf8, }) } @@ -99,6 +103,9 @@ impl ScalarUDFImpl for ConcatFunc { DataType::Utf8 => { Ok(ColumnarValue::Scalar(ScalarValue::Utf8(Some(result)))) } + DataType::LargeUtf8 => { + Ok(ColumnarValue::Scalar(ScalarValue::LargeUtf8(Some(result)))) + } other => { plan_err!("Concat function does not support datatype of {other}") } diff --git a/datafusion/sqllogictest/test_files/string_view.slt b/datafusion/sqllogictest/test_files/string_view.slt index 5fcd445b843e..1d71defca719 100644 --- a/datafusion/sqllogictest/test_files/string_view.slt +++ b/datafusion/sqllogictest/test_files/string_view.slt @@ -768,8 +768,7 @@ logical_plan 01)Projection: character_length(test.column1_utf8view) AS l 02)--TableScan: test projection=[column1_utf8view] -## Ensure no casts for CONCAT -## TODO https://github.com/apache/datafusion/issues/11836 +## Ensure no casts for CONCAT Utf8View query TT EXPLAIN SELECT concat(column1_utf8view, column2_utf8view) as c @@ -779,6 +778,16 @@ logical_plan 01)Projection: concat(test.column1_utf8view, test.column2_utf8view) AS c 02)--TableScan: test projection=[column1_utf8view, column2_utf8view] +## Ensure no casts for CONCAT LargeUtf8 +query TT +EXPLAIN SELECT + concat(column1_large_utf8, column2_large_utf8) as c +FROM test; +---- +logical_plan +01)Projection: concat(test.column1_large_utf8, test.column2_large_utf8) AS c +02)--TableScan: test projection=[column1_large_utf8, column2_large_utf8] + ## Ensure no casts for CONCAT_WS ## TODO https://github.com/apache/datafusion/issues/11837 query TT From 769d99df50420d68094ef0eaaf2e996bd69d7682 Mon Sep 17 00:00:00 2001 From: Devan Date: Thu, 29 Aug 2024 18:04:57 -0500 Subject: [PATCH 09/18] fixing utf8large --- datafusion/sqllogictest/test_files/string_view.slt | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/datafusion/sqllogictest/test_files/string_view.slt b/datafusion/sqllogictest/test_files/string_view.slt index 1d71defca719..ccaf46ca2fc0 100644 --- a/datafusion/sqllogictest/test_files/string_view.slt +++ b/datafusion/sqllogictest/test_files/string_view.slt @@ -905,6 +905,17 @@ XiangpengXiangpengXiangpeng RaphaelRR RR +## Should run CONCAT successfully with utf8large +query T +SELECT + concat(column1_large_utf8, column2_large_utf8) as c +FROM test; +---- +AndrewX +XiangpengXiangpeng +RaphaelR +R + ## Ensure no casts for LPAD query TT EXPLAIN SELECT From ac30a834a698480079c07dab9061607019715785 Mon Sep 17 00:00:00 2001 From: Devan Date: Thu, 29 Aug 2024 19:57:33 -0500 Subject: [PATCH 10/18] fix large utf8 Signed-off-by: Devan --- datafusion/functions/src/string/common.rs | 109 ++++++++++++++++++++-- datafusion/functions/src/string/concat.rs | 56 ++++++++--- 2 files changed, 144 insertions(+), 21 deletions(-) diff --git a/datafusion/functions/src/string/common.rs b/datafusion/functions/src/string/common.rs index c2b8895f41af..ede183c577cc 100644 --- a/datafusion/functions/src/string/common.rs +++ b/datafusion/functions/src/string/common.rs @@ -20,14 +20,9 @@ use std::fmt::{Display, Formatter}; use std::sync::Arc; -use arrow::array::{ - new_null_array, Array, ArrayAccessor, ArrayDataBuilder, ArrayIter, ArrayRef, - GenericStringArray, GenericStringBuilder, OffsetSizeTrait, StringArray, - StringBuilder, StringViewArray, -}; +use arrow::array::{new_null_array, Array, ArrayAccessor, ArrayDataBuilder, ArrayIter, ArrayRef, GenericStringArray, GenericStringBuilder, LargeStringArray, OffsetSizeTrait, StringArray, StringBuilder, StringViewArray}; use arrow::buffer::{Buffer, MutableBuffer, NullBuffer}; use arrow::datatypes::DataType; - use datafusion_common::cast::{as_generic_string_array, as_string_view_array}; use datafusion_common::Result; use datafusion_common::{exec_err, ScalarValue}; @@ -255,6 +250,8 @@ pub(crate) enum ColumnarValueRef<'a> { Scalar(&'a [u8]), NullableArray(&'a StringArray), NonNullableArray(&'a StringArray), + NullableLargeStringArray(&'a LargeStringArray), + NonNullableLargeStringArray(&'a LargeStringArray), NullableStringViewArray(&'a StringViewArray), NonNullableStringViewArray(&'a StringViewArray), } @@ -263,10 +260,10 @@ impl<'a> ColumnarValueRef<'a> { #[inline] pub fn is_valid(&self, i: usize) -> bool { match &self { - Self::Scalar(_) | Self::NonNullableArray(_) => true, - Self::NonNullableStringViewArray(_) => true, + Self::Scalar(_) | Self::NonNullableArray(_) | Self::NonNullableLargeStringArray(_)| Self::NonNullableStringViewArray(_) => true, Self::NullableArray(array) => array.is_valid(i), Self::NullableStringViewArray(array) => array.is_valid(i), + Self::NullableLargeStringArray(array) => array.is_valid(i), } } @@ -275,9 +272,11 @@ impl<'a> ColumnarValueRef<'a> { match &self { Self::Scalar(_) | Self::NonNullableArray(_) - | Self::NonNullableStringViewArray(_) => None, + | Self::NonNullableStringViewArray(_) + | Self::NonNullableLargeStringArray(_) => None, Self::NullableArray(array) => array.nulls().cloned(), Self::NullableStringViewArray(array) => array.nulls().cloned(), + Self::NullableLargeStringArray(array) => array.nulls().cloned(), } } } @@ -396,6 +395,12 @@ impl StringArrayBuilder { .extend_from_slice(array.value(i).as_bytes()); } } + ColumnarValueRef::NullableLargeStringArray(array) => { + if !CHECK_VALID || array.is_valid(i) { + self.value_buffer + .extend_from_slice(array.value(i).as_bytes()); + } + } ColumnarValueRef::NullableStringViewArray(array) => { if !CHECK_VALID || array.is_valid(i) { self.value_buffer @@ -406,6 +411,10 @@ impl StringArrayBuilder { self.value_buffer .extend_from_slice(array.value(i).as_bytes()); } + ColumnarValueRef::NonNullableLargeStringArray(array) => { + self.value_buffer + .extend_from_slice(array.value(i).as_bytes()); + } ColumnarValueRef::NonNullableStringViewArray(array) => { self.value_buffer .extend_from_slice(array.value(i).as_bytes()); @@ -435,6 +444,88 @@ impl StringArrayBuilder { } } +pub(crate) struct LargeStringArrayBuilder { + offsets_buffer: MutableBuffer, + value_buffer: MutableBuffer, +} + +impl LargeStringArrayBuilder { + pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self { + let mut offsets_buffer = MutableBuffer::with_capacity( + (item_capacity + 1) * std::mem::size_of::(), + ); + // SAFETY: the first offset value is definitely not going to exceed the bounds. + unsafe { offsets_buffer.push_unchecked(0_i64) }; + Self { + offsets_buffer, + value_buffer: MutableBuffer::with_capacity(data_capacity), + } + } + + pub fn write( + &mut self, + column: &ColumnarValueRef, + i: usize, + ) { + match column { + ColumnarValueRef::Scalar(s) => { + self.value_buffer.extend_from_slice(s); + } + ColumnarValueRef::NullableArray(array) => { + if !CHECK_VALID || array.is_valid(i) { + self.value_buffer + .extend_from_slice(array.value(i).as_bytes()); + } + } + ColumnarValueRef::NullableLargeStringArray(array) => { + if !CHECK_VALID || array.is_valid(i) { + self.value_buffer + .extend_from_slice(array.value(i).as_bytes()); + } + } + ColumnarValueRef::NullableStringViewArray(array) => { + if !CHECK_VALID || array.is_valid(i) { + self.value_buffer + .extend_from_slice(array.value(i).as_bytes()); + } + } + ColumnarValueRef::NonNullableArray(array) => { + self.value_buffer + .extend_from_slice(array.value(i).as_bytes()); + } + ColumnarValueRef::NonNullableLargeStringArray(array) => { + self.value_buffer + .extend_from_slice(array.value(i).as_bytes()); + } + ColumnarValueRef::NonNullableStringViewArray(array) => { + self.value_buffer + .extend_from_slice(array.value(i).as_bytes()); + } + } + } + + pub fn append_offset(&mut self) { + let next_offset: i64 = self + .value_buffer + .len() + .try_into() + .expect("byte array offset overflow"); + unsafe { self.offsets_buffer.push_unchecked(next_offset) }; + } + + pub fn finish(self, null_buffer: Option) -> LargeStringArray { + let array_builder = ArrayDataBuilder::new(DataType::LargeUtf8) + .len(self.offsets_buffer.len() / std::mem::size_of::() - 1) + .add_buffer(self.offsets_buffer.into()) + .add_buffer(self.value_buffer.into()) + .nulls(null_buffer); + // SAFETY: all data that was appended was valid Large UTF8 and the values + // and offsets were created correctly + let array_data = unsafe { array_builder.build_unchecked() }; + LargeStringArray::from(array_data) + } +} + fn case_conversion_array<'a, O, F>(array: &'a ArrayRef, op: F) -> Result where O: OffsetSizeTrait, diff --git a/datafusion/functions/src/string/concat.rs b/datafusion/functions/src/string/concat.rs index 6bd56b7673df..57000aae869a 100644 --- a/datafusion/functions/src/string/concat.rs +++ b/datafusion/functions/src/string/concat.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use arrow::array::{Array, StringViewArray}; +use arrow::array::{as_largestring_array, Array, StringViewArray}; use arrow::datatypes::DataType; use std::any::Any; use std::sync::Arc; @@ -133,7 +133,7 @@ impl ScalarUDFImpl for ConcatFunc { } ColumnarValue::Array(array) => { match array.data_type() { - DataType::Utf8 | DataType::LargeUtf8 => { + DataType::Utf8 => { let string_array = as_string_array(array)?; data_size += string_array.values().len(); @@ -144,6 +144,17 @@ impl ScalarUDFImpl for ConcatFunc { }; columns.push(column); }, + DataType::LargeUtf8 => { + let string_array = as_largestring_array(array); + + data_size += string_array.values().len(); + let column = if array.is_nullable() { + ColumnarValueRef::NullableLargeStringArray(string_array) + } else { + ColumnarValueRef::NonNullableLargeStringArray(string_array) + }; + columns.push(column); + }, DataType::Utf8View => { let string_array = as_string_view_array(array)?; @@ -164,20 +175,41 @@ impl ScalarUDFImpl for ConcatFunc { } } - let mut builder = StringArrayBuilder::with_capacity(len, data_size); - for i in 0..len { - columns - .iter() - .for_each(|column| builder.write::(column, i)); - builder.append_offset(); - } - let string_array = builder.finish(None); - match args_datatype { - DataType::Utf8 | DataType::LargeUtf8 => { + DataType::Utf8 => { + let mut builder = StringArrayBuilder::with_capacity(len, data_size); + for i in 0..len { + columns + .iter() + .for_each(|column| builder.write::(column, i)); + builder.append_offset(); + } + + let string_array = builder.finish(None); + Ok(ColumnarValue::Array(Arc::new(string_array))) + } + DataType::LargeUtf8 => { + let mut builder = LargeStringArrayBuilder::with_capacity(len, data_size); + for i in 0..len { + columns + .iter() + .for_each(|column| builder.write::(column, i)); + builder.append_offset(); + } + + let string_array = builder.finish(None); Ok(ColumnarValue::Array(Arc::new(string_array))) } DataType::Utf8View => { + let mut builder = StringArrayBuilder::with_capacity(len, data_size); + for i in 0..len { + columns + .iter() + .for_each(|column| builder.write::(column, i)); + builder.append_offset(); + } + + let string_array = builder.finish(None); let string_array_iter = string_array.into_iter(); Ok(ColumnarValue::Array(Arc::new(StringViewArray::from_iter( string_array_iter, From 7ea6e0a7fe8f5545d6d8d29965bc0e1cfb66b06b Mon Sep 17 00:00:00 2001 From: Devan Date: Thu, 29 Aug 2024 19:58:21 -0500 Subject: [PATCH 11/18] fix large utf8 Signed-off-by: Devan --- datafusion/functions/src/string/common.rs | 11 +++++++++-- datafusion/functions/src/string/concat.rs | 4 ++-- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/datafusion/functions/src/string/common.rs b/datafusion/functions/src/string/common.rs index ede183c577cc..315575dc657f 100644 --- a/datafusion/functions/src/string/common.rs +++ b/datafusion/functions/src/string/common.rs @@ -20,7 +20,11 @@ use std::fmt::{Display, Formatter}; use std::sync::Arc; -use arrow::array::{new_null_array, Array, ArrayAccessor, ArrayDataBuilder, ArrayIter, ArrayRef, GenericStringArray, GenericStringBuilder, LargeStringArray, OffsetSizeTrait, StringArray, StringBuilder, StringViewArray}; +use arrow::array::{ + new_null_array, Array, ArrayAccessor, ArrayDataBuilder, ArrayIter, ArrayRef, + GenericStringArray, GenericStringBuilder, LargeStringArray, OffsetSizeTrait, + StringArray, StringBuilder, StringViewArray, +}; use arrow::buffer::{Buffer, MutableBuffer, NullBuffer}; use arrow::datatypes::DataType; use datafusion_common::cast::{as_generic_string_array, as_string_view_array}; @@ -260,7 +264,10 @@ impl<'a> ColumnarValueRef<'a> { #[inline] pub fn is_valid(&self, i: usize) -> bool { match &self { - Self::Scalar(_) | Self::NonNullableArray(_) | Self::NonNullableLargeStringArray(_)| Self::NonNullableStringViewArray(_) => true, + Self::Scalar(_) + | Self::NonNullableArray(_) + | Self::NonNullableLargeStringArray(_) + | Self::NonNullableStringViewArray(_) => true, Self::NullableArray(array) => array.is_valid(i), Self::NullableStringViewArray(array) => array.is_valid(i), Self::NullableLargeStringArray(array) => array.is_valid(i), diff --git a/datafusion/functions/src/string/concat.rs b/datafusion/functions/src/string/concat.rs index 57000aae869a..2ff4c860c2a0 100644 --- a/datafusion/functions/src/string/concat.rs +++ b/datafusion/functions/src/string/concat.rs @@ -153,7 +153,7 @@ impl ScalarUDFImpl for ConcatFunc { } else { ColumnarValueRef::NonNullableLargeStringArray(string_array) }; - columns.push(column); + columns.push(column); }, DataType::Utf8View => { let string_array = as_string_view_array(array)?; @@ -184,7 +184,7 @@ impl ScalarUDFImpl for ConcatFunc { .for_each(|column| builder.write::(column, i)); builder.append_offset(); } - + let string_array = builder.finish(None); Ok(ColumnarValue::Array(Arc::new(string_array))) } From dd3ad39fcfece51c2ecf88eaded3824ac85ca8c8 Mon Sep 17 00:00:00 2001 From: Devan Date: Thu, 29 Aug 2024 20:16:05 -0500 Subject: [PATCH 12/18] add test Signed-off-by: Devan --- datafusion/sqllogictest/test_files/string_view.slt | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/datafusion/sqllogictest/test_files/string_view.slt b/datafusion/sqllogictest/test_files/string_view.slt index ccaf46ca2fc0..eb625e530b66 100644 --- a/datafusion/sqllogictest/test_files/string_view.slt +++ b/datafusion/sqllogictest/test_files/string_view.slt @@ -872,7 +872,7 @@ XIANGPENG RAPHAEL NULL -## Should run CONCAT successfully +## Should run CONCAT successfully with utf8view query T SELECT concat(column1_utf8view, column2_utf8view) as c @@ -883,6 +883,17 @@ XiangpengXiangpeng RaphaelR R +## Should run CONCAT successfully with utf8 +query T +SELECT + concat(column1_utf8, column2_utf8) as c +FROM test; +---- +AndrewX +XiangpengXiangpeng +RaphaelR +R + ## Should run CONCAT successfully with utf8 and utf8view query T SELECT From 504459c657ba234f383cde1b1db560f604ab10bc Mon Sep 17 00:00:00 2001 From: Devan Date: Thu, 29 Aug 2024 20:21:37 -0500 Subject: [PATCH 13/18] fmt Signed-off-by: Devan --- datafusion/functions/src/string/concat.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/datafusion/functions/src/string/concat.rs b/datafusion/functions/src/string/concat.rs index 2ff4c860c2a0..510b6034250b 100644 --- a/datafusion/functions/src/string/concat.rs +++ b/datafusion/functions/src/string/concat.rs @@ -78,7 +78,7 @@ impl ScalarUDFImpl for ConcatFunc { /// Concatenates the text representations of all the arguments. NULL arguments are ignored. /// concat('abcde', 2, NULL, 22) = 'abcde222' fn invoke(&self, args: &[ColumnarValue]) -> Result { - let args_datatype = args[0].data_type(); + let first_arg_datatype = args[0].data_type(); let array_len = args .iter() .filter_map(|x| match x { @@ -96,7 +96,7 @@ impl ScalarUDFImpl for ConcatFunc { } } - return match args_datatype { + return match first_arg_datatype { DataType::Utf8View => { Ok(ColumnarValue::Scalar(ScalarValue::Utf8View(Some(result)))) } @@ -175,7 +175,7 @@ impl ScalarUDFImpl for ConcatFunc { } } - match args_datatype { + match first_arg_datatype { DataType::Utf8 => { let mut builder = StringArrayBuilder::with_capacity(len, data_size); for i in 0..len { From e0819349925ab6c56b782832521d2e241691fb62 Mon Sep 17 00:00:00 2001 From: Devan Date: Fri, 30 Aug 2024 07:44:24 -0500 Subject: [PATCH 14/18] make it so Utf8View just returns Utf8 Signed-off-by: Devan --- datafusion/functions/src/string/concat.rs | 20 ++------------------ 1 file changed, 2 insertions(+), 18 deletions(-) diff --git a/datafusion/functions/src/string/concat.rs b/datafusion/functions/src/string/concat.rs index 510b6034250b..36cafb5257c8 100644 --- a/datafusion/functions/src/string/concat.rs +++ b/datafusion/functions/src/string/concat.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use arrow::array::{as_largestring_array, Array, StringViewArray}; +use arrow::array::{as_largestring_array, Array}; use arrow::datatypes::DataType; use std::any::Any; use std::sync::Arc; @@ -69,7 +69,6 @@ impl ScalarUDFImpl for ConcatFunc { fn return_type(&self, arg_types: &[DataType]) -> Result { use DataType::*; Ok(match &arg_types[0] { - Utf8View => Utf8View, LargeUtf8 => LargeUtf8, _ => Utf8, }) @@ -176,7 +175,7 @@ impl ScalarUDFImpl for ConcatFunc { } match first_arg_datatype { - DataType::Utf8 => { + DataType::Utf8 | DataType::Utf8View => { let mut builder = StringArrayBuilder::with_capacity(len, data_size); for i in 0..len { columns @@ -200,21 +199,6 @@ impl ScalarUDFImpl for ConcatFunc { let string_array = builder.finish(None); Ok(ColumnarValue::Array(Arc::new(string_array))) } - DataType::Utf8View => { - let mut builder = StringArrayBuilder::with_capacity(len, data_size); - for i in 0..len { - columns - .iter() - .for_each(|column| builder.write::(column, i)); - builder.append_offset(); - } - - let string_array = builder.finish(None); - let string_array_iter = string_array.into_iter(); - Ok(ColumnarValue::Array(Arc::new(StringViewArray::from_iter( - string_array_iter, - )))) - } _ => unreachable!(), } } From 0069c1a10821eee5f55db62acc7d52621fb1a096 Mon Sep 17 00:00:00 2001 From: Devan Date: Sun, 1 Sep 2024 09:51:20 -0500 Subject: [PATCH 15/18] wip -- trying to build a stringview with columnar refs Signed-off-by: Devan --- datafusion/functions/src/string/common.rs | 87 +++++++++++++++++++++++ datafusion/functions/src/string/concat.rs | 56 ++++++++++++--- 2 files changed, 134 insertions(+), 9 deletions(-) diff --git a/datafusion/functions/src/string/common.rs b/datafusion/functions/src/string/common.rs index 315575dc657f..13a08df27e66 100644 --- a/datafusion/functions/src/string/common.rs +++ b/datafusion/functions/src/string/common.rs @@ -250,6 +250,7 @@ where } } +#[derive(Debug)] pub(crate) enum ColumnarValueRef<'a> { Scalar(&'a [u8]), NullableArray(&'a StringArray), @@ -435,6 +436,7 @@ impl StringArrayBuilder { .len() .try_into() .expect("byte array offset overflow"); + println!("appending offset of size {next_offset}.."); unsafe { self.offsets_buffer.push_unchecked(next_offset) }; } @@ -447,10 +449,95 @@ impl StringArrayBuilder { // SAFETY: all data that was appended was valid UTF8 and the values // and offsets were created correctly let array_data = unsafe { array_builder.build_unchecked() }; + let buf = &array_data.offset(); + println!("{buf:?}"); StringArray::from(array_data) } } +pub(crate) struct StringViewArrayBuilder { + offsets_buffer: MutableBuffer, + value_buffer: MutableBuffer, +} + +impl StringViewArrayBuilder { + pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self { + let mut offsets_buffer = MutableBuffer::with_capacity( + (item_capacity + 1) * std::mem::size_of::(), + ); + // SAFETY: the first offset value is definitely not going to exceed the bounds. + unsafe { offsets_buffer.push_unchecked(0_u128) }; + Self { + offsets_buffer, + value_buffer: MutableBuffer::with_capacity(data_capacity), + } + } + + pub fn write( + &mut self, + column: &ColumnarValueRef, + i: usize, + ) { + match column { + ColumnarValueRef::Scalar(s) => { + self.value_buffer.extend_from_slice(s); + } + ColumnarValueRef::NullableArray(array) => { + if !CHECK_VALID || array.is_valid(i) { + self.value_buffer + .extend_from_slice(array.value(i).as_bytes()); + } + } + ColumnarValueRef::NullableLargeStringArray(array) => { + if !CHECK_VALID || array.is_valid(i) { + self.value_buffer + .extend_from_slice(array.value(i).as_bytes()); + } + } + ColumnarValueRef::NullableStringViewArray(array) => { + if !CHECK_VALID || array.is_valid(i) { + self.value_buffer + .extend_from_slice(array.value(i).as_bytes()); + } + } + ColumnarValueRef::NonNullableArray(array) => { + self.value_buffer + .extend_from_slice(array.value(i).as_bytes()); + } + ColumnarValueRef::NonNullableLargeStringArray(array) => { + self.value_buffer + .extend_from_slice(array.value(i).as_bytes()); + } + ColumnarValueRef::NonNullableStringViewArray(array) => { + self.value_buffer + .extend_from_slice(array.value(i).as_bytes()); + } + } + } + + pub fn append_offset(&mut self) { + let next_offset: u128 = self + .value_buffer + .len() + .try_into() + .expect("byte array offset overflow"); + unsafe { self.offsets_buffer.push_unchecked(next_offset) }; + } + + pub fn finish(self, null_buffer: Option) -> StringViewArray { + let array_builder = ArrayDataBuilder::new(DataType::Utf8View) + .len(self.offsets_buffer.len() / std::mem::size_of::() - 1) + .add_buffer(self.offsets_buffer.into()) + .add_buffer(self.value_buffer.into()) + .nulls(null_buffer); + // SAFETY: all data that was appended was valid UTF8 and the values + // and offsets were created correctly + let array_data = unsafe { array_builder.build_unchecked() }; + println!("{array_data:?}"); + StringViewArray::from(array_data) + } +} + pub(crate) struct LargeStringArrayBuilder { offsets_buffer: MutableBuffer, value_buffer: MutableBuffer, diff --git a/datafusion/functions/src/string/concat.rs b/datafusion/functions/src/string/concat.rs index 36cafb5257c8..d3e4c8a30914 100644 --- a/datafusion/functions/src/string/concat.rs +++ b/datafusion/functions/src/string/concat.rs @@ -68,16 +68,30 @@ impl ScalarUDFImpl for ConcatFunc { fn return_type(&self, arg_types: &[DataType]) -> Result { use DataType::*; - Ok(match &arg_types[0] { - LargeUtf8 => LargeUtf8, - _ => Utf8, - }) + let mut dt = &Utf8; + arg_types.iter().for_each(|data_type| { + if data_type == &Utf8View { + dt = data_type; + } else if data_type == &LargeUtf8 && dt != &Utf8View { + dt = data_type; + } + }); + + Ok(dt.to_owned()) } /// Concatenates the text representations of all the arguments. NULL arguments are ignored. /// concat('abcde', 2, NULL, 22) = 'abcde222' fn invoke(&self, args: &[ColumnarValue]) -> Result { - let first_arg_datatype = args[0].data_type(); + let mut return_datatype = DataType::Utf8; + args.iter().for_each(|col| { + if col.data_type() == DataType::Utf8View { + return_datatype = col.data_type(); + } else if col.data_type() == DataType::LargeUtf8 && return_datatype != DataType::Utf8View { + return_datatype = col.data_type(); + } + }); + let array_len = args .iter() .filter_map(|x| match x { @@ -95,7 +109,7 @@ impl ScalarUDFImpl for ConcatFunc { } } - return match first_arg_datatype { + return match return_datatype { DataType::Utf8View => { Ok(ColumnarValue::Scalar(ScalarValue::Utf8View(Some(result)))) } @@ -174,8 +188,8 @@ impl ScalarUDFImpl for ConcatFunc { } } - match first_arg_datatype { - DataType::Utf8 | DataType::Utf8View => { + match return_datatype { + DataType::Utf8 => { let mut builder = StringArrayBuilder::with_capacity(len, data_size); for i in 0..len { columns @@ -187,6 +201,18 @@ impl ScalarUDFImpl for ConcatFunc { let string_array = builder.finish(None); Ok(ColumnarValue::Array(Arc::new(string_array))) } + DataType::Utf8View => { + let mut builder = StringViewArrayBuilder::with_capacity(len, data_size); + for i in 0..len { + columns + .iter() + .for_each(|column| builder.write::(column, i)); + builder.append_offset(); + } + + let string_array = builder.finish(None); + Ok(ColumnarValue::Array(Arc::new(string_array))) + }, DataType::LargeUtf8 => { let mut builder = LargeStringArrayBuilder::with_capacity(len, data_size); for i in 0..len { @@ -271,7 +297,7 @@ pub fn simplify_concat(args: Vec) -> Result { mod tests { use super::*; use crate::utils::test::test_function; - use arrow::array::Array; + use arrow::array::{Array, StringViewArray}; use arrow::array::{ArrayRef, StringArray}; use DataType::*; @@ -309,6 +335,18 @@ mod tests { Utf8, StringArray ); + test_function!( + ConcatFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::Utf8(Some("aa".to_string()))), + ColumnarValue::Scalar(ScalarValue::LargeUtf8(Some("bb".to_string()))), + ColumnarValue::Scalar(ScalarValue::Utf8View(Some("cc".to_string()))) + ], + Ok(Some("aabbcc")), + &str, + Utf8View, + StringArray + ); Ok(()) } From 0929a4b293a2134e5eefc4a4afc7821c41f05e7c Mon Sep 17 00:00:00 2001 From: Devan Date: Sun, 1 Sep 2024 11:16:54 -0500 Subject: [PATCH 16/18] built stringview builder but it does allocate a new String each iter :( Signed-off-by: Devan --- datafusion/functions/src/string/common.rs | 66 +++++++---------------- datafusion/functions/src/string/concat.rs | 2 +- 2 files changed, 19 insertions(+), 49 deletions(-) diff --git a/datafusion/functions/src/string/common.rs b/datafusion/functions/src/string/common.rs index 13a08df27e66..ff25a8057057 100644 --- a/datafusion/functions/src/string/common.rs +++ b/datafusion/functions/src/string/common.rs @@ -20,11 +20,7 @@ use std::fmt::{Display, Formatter}; use std::sync::Arc; -use arrow::array::{ - new_null_array, Array, ArrayAccessor, ArrayDataBuilder, ArrayIter, ArrayRef, - GenericStringArray, GenericStringBuilder, LargeStringArray, OffsetSizeTrait, - StringArray, StringBuilder, StringViewArray, -}; +use arrow::array::{new_null_array, Array, ArrayAccessor, ArrayDataBuilder, ArrayIter, ArrayRef, GenericStringArray, GenericStringBuilder, LargeStringArray, OffsetSizeTrait, StringArray, StringBuilder, StringViewArray, StringViewBuilder}; use arrow::buffer::{Buffer, MutableBuffer, NullBuffer}; use arrow::datatypes::DataType; use datafusion_common::cast::{as_generic_string_array, as_string_view_array}; @@ -436,7 +432,6 @@ impl StringArrayBuilder { .len() .try_into() .expect("byte array offset overflow"); - println!("appending offset of size {next_offset}.."); unsafe { self.offsets_buffer.push_unchecked(next_offset) }; } @@ -449,27 +444,21 @@ impl StringArrayBuilder { // SAFETY: all data that was appended was valid UTF8 and the values // and offsets were created correctly let array_data = unsafe { array_builder.build_unchecked() }; - let buf = &array_data.offset(); - println!("{buf:?}"); StringArray::from(array_data) } } pub(crate) struct StringViewArrayBuilder { - offsets_buffer: MutableBuffer, - value_buffer: MutableBuffer, + builder: StringViewBuilder, + block: String } impl StringViewArrayBuilder { - pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self { - let mut offsets_buffer = MutableBuffer::with_capacity( - (item_capacity + 1) * std::mem::size_of::(), - ); - // SAFETY: the first offset value is definitely not going to exceed the bounds. - unsafe { offsets_buffer.push_unchecked(0_u128) }; + pub fn with_capacity(_item_capacity: usize, data_capacity: usize) -> Self { + let builder = StringViewBuilder::with_capacity(data_capacity); Self { - offsets_buffer, - value_buffer: MutableBuffer::with_capacity(data_capacity), + builder, + block: String::new() } } @@ -480,61 +469,42 @@ impl StringViewArrayBuilder { ) { match column { ColumnarValueRef::Scalar(s) => { - self.value_buffer.extend_from_slice(s); + self.block.push_str(std::str::from_utf8(s).unwrap()); } ColumnarValueRef::NullableArray(array) => { if !CHECK_VALID || array.is_valid(i) { - self.value_buffer - .extend_from_slice(array.value(i).as_bytes()); + self.block.push_str(std::str::from_utf8(array.value(i).as_bytes()).unwrap()); } } ColumnarValueRef::NullableLargeStringArray(array) => { if !CHECK_VALID || array.is_valid(i) { - self.value_buffer - .extend_from_slice(array.value(i).as_bytes()); + self.block.push_str(std::str::from_utf8(array.value(i).as_bytes()).unwrap()); } } ColumnarValueRef::NullableStringViewArray(array) => { if !CHECK_VALID || array.is_valid(i) { - self.value_buffer - .extend_from_slice(array.value(i).as_bytes()); + self.block.push_str(std::str::from_utf8(array.value(i).as_bytes()).unwrap()); } } ColumnarValueRef::NonNullableArray(array) => { - self.value_buffer - .extend_from_slice(array.value(i).as_bytes()); + self.block.push_str(std::str::from_utf8(array.value(i).as_bytes()).unwrap()); } ColumnarValueRef::NonNullableLargeStringArray(array) => { - self.value_buffer - .extend_from_slice(array.value(i).as_bytes()); + self.block.push_str(std::str::from_utf8(array.value(i).as_bytes()).unwrap()); } ColumnarValueRef::NonNullableStringViewArray(array) => { - self.value_buffer - .extend_from_slice(array.value(i).as_bytes()); + self.block.push_str(std::str::from_utf8(array.value(i).as_bytes()).unwrap()); } } } pub fn append_offset(&mut self) { - let next_offset: u128 = self - .value_buffer - .len() - .try_into() - .expect("byte array offset overflow"); - unsafe { self.offsets_buffer.push_unchecked(next_offset) }; + self.builder.append_value(&self.block); + self.block = String::new(); } - pub fn finish(self, null_buffer: Option) -> StringViewArray { - let array_builder = ArrayDataBuilder::new(DataType::Utf8View) - .len(self.offsets_buffer.len() / std::mem::size_of::() - 1) - .add_buffer(self.offsets_buffer.into()) - .add_buffer(self.value_buffer.into()) - .nulls(null_buffer); - // SAFETY: all data that was appended was valid UTF8 and the values - // and offsets were created correctly - let array_data = unsafe { array_builder.build_unchecked() }; - println!("{array_data:?}"); - StringViewArray::from(array_data) + pub fn finish(mut self) -> StringViewArray { + self.builder.finish() } } diff --git a/datafusion/functions/src/string/concat.rs b/datafusion/functions/src/string/concat.rs index d3e4c8a30914..5cfc2cec009d 100644 --- a/datafusion/functions/src/string/concat.rs +++ b/datafusion/functions/src/string/concat.rs @@ -210,7 +210,7 @@ impl ScalarUDFImpl for ConcatFunc { builder.append_offset(); } - let string_array = builder.finish(None); + let string_array = builder.finish(); Ok(ColumnarValue::Array(Arc::new(string_array))) }, DataType::LargeUtf8 => { From f16de44ab1b3ffb52d7504f4bceb69df97a1faac Mon Sep 17 00:00:00 2001 From: Devan Date: Sun, 1 Sep 2024 11:29:45 -0500 Subject: [PATCH 17/18] add some testing Signed-off-by: Devan --- datafusion/functions/src/string/common.rs | 31 ++++++++++++++++------- datafusion/functions/src/string/concat.rs | 31 +++++++++++++++++------ 2 files changed, 45 insertions(+), 17 deletions(-) diff --git a/datafusion/functions/src/string/common.rs b/datafusion/functions/src/string/common.rs index ff25a8057057..060c5158efa5 100644 --- a/datafusion/functions/src/string/common.rs +++ b/datafusion/functions/src/string/common.rs @@ -20,7 +20,11 @@ use std::fmt::{Display, Formatter}; use std::sync::Arc; -use arrow::array::{new_null_array, Array, ArrayAccessor, ArrayDataBuilder, ArrayIter, ArrayRef, GenericStringArray, GenericStringBuilder, LargeStringArray, OffsetSizeTrait, StringArray, StringBuilder, StringViewArray, StringViewBuilder}; +use arrow::array::{ + new_null_array, Array, ArrayAccessor, ArrayDataBuilder, ArrayIter, ArrayRef, + GenericStringArray, GenericStringBuilder, LargeStringArray, OffsetSizeTrait, + StringArray, StringBuilder, StringViewArray, StringViewBuilder, +}; use arrow::buffer::{Buffer, MutableBuffer, NullBuffer}; use arrow::datatypes::DataType; use datafusion_common::cast::{as_generic_string_array, as_string_view_array}; @@ -450,7 +454,7 @@ impl StringArrayBuilder { pub(crate) struct StringViewArrayBuilder { builder: StringViewBuilder, - block: String + block: String, } impl StringViewArrayBuilder { @@ -458,7 +462,7 @@ impl StringViewArrayBuilder { let builder = StringViewBuilder::with_capacity(data_capacity); Self { builder, - block: String::new() + block: String::new(), } } @@ -473,27 +477,36 @@ impl StringViewArrayBuilder { } ColumnarValueRef::NullableArray(array) => { if !CHECK_VALID || array.is_valid(i) { - self.block.push_str(std::str::from_utf8(array.value(i).as_bytes()).unwrap()); + self.block.push_str( + std::str::from_utf8(array.value(i).as_bytes()).unwrap(), + ); } } ColumnarValueRef::NullableLargeStringArray(array) => { if !CHECK_VALID || array.is_valid(i) { - self.block.push_str(std::str::from_utf8(array.value(i).as_bytes()).unwrap()); + self.block.push_str( + std::str::from_utf8(array.value(i).as_bytes()).unwrap(), + ); } } ColumnarValueRef::NullableStringViewArray(array) => { if !CHECK_VALID || array.is_valid(i) { - self.block.push_str(std::str::from_utf8(array.value(i).as_bytes()).unwrap()); + self.block.push_str( + std::str::from_utf8(array.value(i).as_bytes()).unwrap(), + ); } } ColumnarValueRef::NonNullableArray(array) => { - self.block.push_str(std::str::from_utf8(array.value(i).as_bytes()).unwrap()); + self.block + .push_str(std::str::from_utf8(array.value(i).as_bytes()).unwrap()); } ColumnarValueRef::NonNullableLargeStringArray(array) => { - self.block.push_str(std::str::from_utf8(array.value(i).as_bytes()).unwrap()); + self.block + .push_str(std::str::from_utf8(array.value(i).as_bytes()).unwrap()); } ColumnarValueRef::NonNullableStringViewArray(array) => { - self.block.push_str(std::str::from_utf8(array.value(i).as_bytes()).unwrap()); + self.block + .push_str(std::str::from_utf8(array.value(i).as_bytes()).unwrap()); } } } diff --git a/datafusion/functions/src/string/concat.rs b/datafusion/functions/src/string/concat.rs index 5cfc2cec009d..65688f540938 100644 --- a/datafusion/functions/src/string/concat.rs +++ b/datafusion/functions/src/string/concat.rs @@ -87,7 +87,9 @@ impl ScalarUDFImpl for ConcatFunc { args.iter().for_each(|col| { if col.data_type() == DataType::Utf8View { return_datatype = col.data_type(); - } else if col.data_type() == DataType::LargeUtf8 && return_datatype != DataType::Utf8View { + } else if col.data_type() == DataType::LargeUtf8 + && return_datatype != DataType::Utf8View + { return_datatype = col.data_type(); } }); @@ -212,7 +214,7 @@ impl ScalarUDFImpl for ConcatFunc { let string_array = builder.finish(); Ok(ColumnarValue::Array(Arc::new(string_array))) - }, + } DataType::LargeUtf8 => { let mut builder = LargeStringArrayBuilder::with_capacity(len, data_size); for i in 0..len { @@ -297,7 +299,7 @@ pub fn simplify_concat(args: Vec) -> Result { mod tests { use super::*; use crate::utils::test::test_function; - use arrow::array::{Array, StringViewArray}; + use arrow::array::{Array, LargeStringArray, StringViewArray}; use arrow::array::{ArrayRef, StringArray}; use DataType::*; @@ -338,14 +340,27 @@ mod tests { test_function!( ConcatFunc::new(), &[ - ColumnarValue::Scalar(ScalarValue::Utf8(Some("aa".to_string()))), - ColumnarValue::Scalar(ScalarValue::LargeUtf8(Some("bb".to_string()))), - ColumnarValue::Scalar(ScalarValue::Utf8View(Some("cc".to_string()))) + ColumnarValue::Scalar(ScalarValue::from("aa")), + ColumnarValue::Scalar(ScalarValue::Utf8View(None)), + ColumnarValue::Scalar(ScalarValue::LargeUtf8(None)), + ColumnarValue::Scalar(ScalarValue::from("cc")), ], - Ok(Some("aabbcc")), + Ok(Some("aacc")), &str, Utf8View, - StringArray + StringViewArray + ); + test_function!( + ConcatFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("aa")), + ColumnarValue::Scalar(ScalarValue::LargeUtf8(None)), + ColumnarValue::Scalar(ScalarValue::from("cc")), + ], + Ok(Some("aacc")), + &str, + LargeUtf8, + LargeStringArray ); Ok(()) From d0bf3bad4fe4d7d8ffce8496ef1ee07cf8cd665b Mon Sep 17 00:00:00 2001 From: Devan Date: Sun, 1 Sep 2024 11:42:12 -0500 Subject: [PATCH 18/18] clippy Signed-off-by: Devan --- datafusion/functions/src/string/concat.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/datafusion/functions/src/string/concat.rs b/datafusion/functions/src/string/concat.rs index 65688f540938..00fe69b0bd33 100644 --- a/datafusion/functions/src/string/concat.rs +++ b/datafusion/functions/src/string/concat.rs @@ -72,7 +72,8 @@ impl ScalarUDFImpl for ConcatFunc { arg_types.iter().for_each(|data_type| { if data_type == &Utf8View { dt = data_type; - } else if data_type == &LargeUtf8 && dt != &Utf8View { + } + if data_type == &LargeUtf8 && dt != &Utf8View { dt = data_type; } }); @@ -87,7 +88,8 @@ impl ScalarUDFImpl for ConcatFunc { args.iter().for_each(|col| { if col.data_type() == DataType::Utf8View { return_datatype = col.data_type(); - } else if col.data_type() == DataType::LargeUtf8 + } + if col.data_type() == DataType::LargeUtf8 && return_datatype != DataType::Utf8View { return_datatype = col.data_type();