From b557b7d1151172697ce2a903f960690b249f8978 Mon Sep 17 00:00:00 2001 From: Cheng-Yuan-Lai Date: Sat, 22 Feb 2025 23:06:22 +0800 Subject: [PATCH 01/15] refactor: use TypeSignature::Coercible for crypto functions --- datafusion/functions/src/crypto/digest.rs | 22 ++++++++++++++-------- datafusion/functions/src/crypto/md5.rs | 22 ++++++++++++++++------ datafusion/functions/src/crypto/sha224.rs | 21 +++++++++++++++------ datafusion/functions/src/crypto/sha256.rs | 21 +++++++++++++++------ datafusion/functions/src/crypto/sha384.rs | 21 +++++++++++++++------ datafusion/functions/src/crypto/sha512.rs | 21 +++++++++++++++------ 6 files changed, 90 insertions(+), 38 deletions(-) diff --git a/datafusion/functions/src/crypto/digest.rs b/datafusion/functions/src/crypto/digest.rs index 4f9d4605fe07..60821e04298f 100644 --- a/datafusion/functions/src/crypto/digest.rs +++ b/datafusion/functions/src/crypto/digest.rs @@ -18,11 +18,15 @@ //! "crypto" DataFusion functions use super::basic::{digest, utf8_or_binary_to_binary_type}; use arrow::datatypes::DataType; -use datafusion_common::Result; +use datafusion_common::{ + types::{logical_binary, logical_string}, + Result, +}; use datafusion_expr::{ ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature, - TypeSignature::*, Volatility, + TypeSignature, Volatility, }; +use datafusion_expr_common::signature::{Coercion, TypeSignatureClass}; use datafusion_macros::user_doc; use std::any::Any; @@ -64,15 +68,17 @@ impl Default for DigestFunc { impl DigestFunc { pub fn new() -> Self { - use DataType::*; Self { signature: Signature::one_of( vec![ - Exact(vec![Utf8View, Utf8View]), - Exact(vec![Utf8, Utf8]), - Exact(vec![LargeUtf8, Utf8]), - Exact(vec![Binary, Utf8]), - Exact(vec![LargeBinary, Utf8]), + TypeSignature::Coercible(vec![ + Coercion::new_exact(TypeSignatureClass::Native(logical_string())), + Coercion::new_exact(TypeSignatureClass::Native(logical_string())), + ]), + TypeSignature::Coercible(vec![ + Coercion::new_exact(TypeSignatureClass::Native(logical_string())), + Coercion::new_exact(TypeSignatureClass::Native(logical_binary())), + ]), ], Volatility::Immutable, ), diff --git a/datafusion/functions/src/crypto/md5.rs b/datafusion/functions/src/crypto/md5.rs index 18ad0d6a7ded..68f8142bce1c 100644 --- a/datafusion/functions/src/crypto/md5.rs +++ b/datafusion/functions/src/crypto/md5.rs @@ -18,11 +18,16 @@ //! "crypto" DataFusion functions use crate::crypto::basic::md5; use arrow::datatypes::DataType; -use datafusion_common::{plan_err, Result}; +use datafusion_common::{ + plan_err, + types::{logical_binary, logical_string}, + Result, +}; use datafusion_expr::{ ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature, - Volatility, + TypeSignature, Volatility, }; +use datafusion_expr_common::signature::{Coercion, TypeSignatureClass}; use datafusion_macros::user_doc; use std::any::Any; @@ -52,11 +57,16 @@ impl Default for Md5Func { impl Md5Func { pub fn new() -> Self { - use DataType::*; Self { - signature: Signature::uniform( - 1, - vec![Utf8View, Utf8, LargeUtf8, Binary, LargeBinary], + signature: Signature::one_of( + vec![ + TypeSignature::Coercible(vec![Coercion::new_exact( + TypeSignatureClass::Native(logical_string()), + )]), + TypeSignature::Coercible(vec![Coercion::new_exact( + TypeSignatureClass::Native(logical_binary()), + )]), + ], Volatility::Immutable, ), } diff --git a/datafusion/functions/src/crypto/sha224.rs b/datafusion/functions/src/crypto/sha224.rs index 24fe5e119df3..b155e4efcbb4 100644 --- a/datafusion/functions/src/crypto/sha224.rs +++ b/datafusion/functions/src/crypto/sha224.rs @@ -18,11 +18,15 @@ //! "crypto" DataFusion functions use super::basic::{sha224, utf8_or_binary_to_binary_type}; use arrow::datatypes::DataType; -use datafusion_common::Result; +use datafusion_common::{ + types::{logical_binary, logical_string}, + Result, +}; use datafusion_expr::{ ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature, - Volatility, + TypeSignature, Volatility, }; +use datafusion_expr_common::signature::{Coercion, TypeSignatureClass}; use datafusion_macros::user_doc; use std::any::Any; @@ -53,11 +57,16 @@ impl Default for SHA224Func { impl SHA224Func { pub fn new() -> Self { - use DataType::*; Self { - signature: Signature::uniform( - 1, - vec![Utf8View, Utf8, LargeUtf8, Binary, LargeBinary], + signature: Signature::one_of( + vec![ + TypeSignature::Coercible(vec![Coercion::new_exact( + TypeSignatureClass::Native(logical_string()), + )]), + TypeSignature::Coercible(vec![Coercion::new_exact( + TypeSignatureClass::Native(logical_binary()), + )]), + ], Volatility::Immutable, ), } diff --git a/datafusion/functions/src/crypto/sha256.rs b/datafusion/functions/src/crypto/sha256.rs index c48dda19cbc5..f275295731b1 100644 --- a/datafusion/functions/src/crypto/sha256.rs +++ b/datafusion/functions/src/crypto/sha256.rs @@ -18,11 +18,15 @@ //! "crypto" DataFusion functions use super::basic::{sha256, utf8_or_binary_to_binary_type}; use arrow::datatypes::DataType; -use datafusion_common::Result; +use datafusion_common::{ + types::{logical_binary, logical_string}, + Result, +}; use datafusion_expr::{ ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature, - Volatility, + TypeSignature, Volatility, }; +use datafusion_expr_common::signature::{Coercion, TypeSignatureClass}; use datafusion_macros::user_doc; use std::any::Any; @@ -52,11 +56,16 @@ impl Default for SHA256Func { impl SHA256Func { pub fn new() -> Self { - use DataType::*; Self { - signature: Signature::uniform( - 1, - vec![Utf8View, Utf8, LargeUtf8, Binary, LargeBinary], + signature: Signature::one_of( + vec![ + TypeSignature::Coercible(vec![Coercion::new_exact( + TypeSignatureClass::Native(logical_string()), + )]), + TypeSignature::Coercible(vec![Coercion::new_exact( + TypeSignatureClass::Native(logical_binary()), + )]), + ], Volatility::Immutable, ), } diff --git a/datafusion/functions/src/crypto/sha384.rs b/datafusion/functions/src/crypto/sha384.rs index 11d1d130e929..4f6f8e1774fd 100644 --- a/datafusion/functions/src/crypto/sha384.rs +++ b/datafusion/functions/src/crypto/sha384.rs @@ -18,11 +18,15 @@ //! "crypto" DataFusion functions use super::basic::{sha384, utf8_or_binary_to_binary_type}; use arrow::datatypes::DataType; -use datafusion_common::Result; +use datafusion_common::{ + types::{logical_binary, logical_string}, + Result, +}; use datafusion_expr::{ ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature, - Volatility, + TypeSignature, Volatility, }; +use datafusion_expr_common::signature::{Coercion, TypeSignatureClass}; use datafusion_macros::user_doc; use std::any::Any; @@ -52,11 +56,16 @@ impl Default for SHA384Func { impl SHA384Func { pub fn new() -> Self { - use DataType::*; Self { - signature: Signature::uniform( - 1, - vec![Utf8View, Utf8, LargeUtf8, Binary, LargeBinary], + signature: Signature::one_of( + vec![ + TypeSignature::Coercible(vec![Coercion::new_exact( + TypeSignatureClass::Native(logical_string()), + )]), + TypeSignature::Coercible(vec![Coercion::new_exact( + TypeSignatureClass::Native(logical_binary()), + )]), + ], Volatility::Immutable, ), } diff --git a/datafusion/functions/src/crypto/sha512.rs b/datafusion/functions/src/crypto/sha512.rs index 26fa85a5da3a..bdb4fc74f1e2 100644 --- a/datafusion/functions/src/crypto/sha512.rs +++ b/datafusion/functions/src/crypto/sha512.rs @@ -18,11 +18,15 @@ //! "crypto" DataFusion functions use super::basic::{sha512, utf8_or_binary_to_binary_type}; use arrow::datatypes::DataType; -use datafusion_common::Result; +use datafusion_common::{ + types::{logical_binary, logical_string}, + Result, +}; use datafusion_expr::{ ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature, - Volatility, + TypeSignature, Volatility, }; +use datafusion_expr_common::signature::{Coercion, TypeSignatureClass}; use datafusion_macros::user_doc; use std::any::Any; @@ -52,11 +56,16 @@ impl Default for SHA512Func { impl SHA512Func { pub fn new() -> Self { - use DataType::*; Self { - signature: Signature::uniform( - 1, - vec![Utf8View, Utf8, LargeUtf8, Binary, LargeBinary], + signature: Signature::one_of( + vec![ + TypeSignature::Coercible(vec![Coercion::new_exact( + TypeSignatureClass::Native(logical_string()), + )]), + TypeSignature::Coercible(vec![Coercion::new_exact( + TypeSignatureClass::Native(logical_binary()), + )]), + ], Volatility::Immutable, ), } From 01f6bcc36c6780eaede25103cab679195805fffe Mon Sep 17 00:00:00 2001 From: Cheng-Yuan-Lai Date: Sun, 23 Feb 2025 11:31:50 +0800 Subject: [PATCH 02/15] fix --- datafusion/functions/src/crypto/digest.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/functions/src/crypto/digest.rs b/datafusion/functions/src/crypto/digest.rs index 60821e04298f..2840006169be 100644 --- a/datafusion/functions/src/crypto/digest.rs +++ b/datafusion/functions/src/crypto/digest.rs @@ -76,8 +76,8 @@ impl DigestFunc { Coercion::new_exact(TypeSignatureClass::Native(logical_string())), ]), TypeSignature::Coercible(vec![ - Coercion::new_exact(TypeSignatureClass::Native(logical_string())), Coercion::new_exact(TypeSignatureClass::Native(logical_binary())), + Coercion::new_exact(TypeSignatureClass::Native(logical_string())), ]), ], Volatility::Immutable, From 556a8c3f46e4b8a2823d254ed4e9e8b6f0f25406 Mon Sep 17 00:00:00 2001 From: Cheng-Yuan-Lai Date: Mon, 24 Feb 2025 00:34:47 +0800 Subject: [PATCH 03/15] fix --- datafusion/functions/src/crypto/digest.rs | 24 +++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/datafusion/functions/src/crypto/digest.rs b/datafusion/functions/src/crypto/digest.rs index 2840006169be..8c131daf7c9c 100644 --- a/datafusion/functions/src/crypto/digest.rs +++ b/datafusion/functions/src/crypto/digest.rs @@ -19,7 +19,7 @@ use super::basic::{digest, utf8_or_binary_to_binary_type}; use arrow::datatypes::DataType; use datafusion_common::{ - types::{logical_binary, logical_string}, + types::{logical_binary, logical_string, NativeType}, Result, }; use datafusion_expr::{ @@ -69,17 +69,17 @@ impl Default for DigestFunc { impl DigestFunc { pub fn new() -> Self { Self { - signature: Signature::one_of( - vec![ - TypeSignature::Coercible(vec![ - Coercion::new_exact(TypeSignatureClass::Native(logical_string())), - Coercion::new_exact(TypeSignatureClass::Native(logical_string())), - ]), - TypeSignature::Coercible(vec![ - Coercion::new_exact(TypeSignatureClass::Native(logical_binary())), - Coercion::new_exact(TypeSignatureClass::Native(logical_string())), - ]), - ], + signature: Signature::new( + TypeSignature::Coercible(vec![ + // First argument: Accepts string types and can coerce from binary + Coercion::new_implicit( + TypeSignatureClass::Native(logical_string()), + vec![TypeSignatureClass::Native(logical_binary())], + NativeType::String, + ), + // Second argument: Only accepts string types, no coercion from binary + Coercion::new_exact(TypeSignatureClass::Native(logical_string())), + ]), Volatility::Immutable, ), } From 7af8087faceb9911df22df27b58373a6f9313ced Mon Sep 17 00:00:00 2001 From: Cheng-Yuan-Lai Date: Wed, 26 Feb 2025 00:01:04 +0800 Subject: [PATCH 04/15] fix string_view.slt --- datafusion/functions/src/crypto/digest.rs | 22 +++++++++---------- .../test_files/string/string_view.slt | 2 +- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/datafusion/functions/src/crypto/digest.rs b/datafusion/functions/src/crypto/digest.rs index 8c131daf7c9c..c7a21ffcee5d 100644 --- a/datafusion/functions/src/crypto/digest.rs +++ b/datafusion/functions/src/crypto/digest.rs @@ -69,17 +69,17 @@ impl Default for DigestFunc { impl DigestFunc { pub fn new() -> Self { Self { - signature: Signature::new( - TypeSignature::Coercible(vec![ - // First argument: Accepts string types and can coerce from binary - Coercion::new_implicit( - TypeSignatureClass::Native(logical_string()), - vec![TypeSignatureClass::Native(logical_binary())], - NativeType::String, - ), - // Second argument: Only accepts string types, no coercion from binary - Coercion::new_exact(TypeSignatureClass::Native(logical_string())), - ]), + signature: Signature::one_of( + vec![ + TypeSignature::Coercible(vec![ + Coercion::new_exact(TypeSignatureClass::Native(logical_string())), + Coercion::new_exact(TypeSignatureClass::Native(logical_string())), + ]), + TypeSignature::Coercible(vec![ + Coercion::new_exact(TypeSignatureClass::Native(logical_binary())), + Coercion::new_exact(TypeSignatureClass::Native(logical_string())), + ]), + ], Volatility::Immutable, ), } diff --git a/datafusion/sqllogictest/test_files/string/string_view.slt b/datafusion/sqllogictest/test_files/string/string_view.slt index 96fb2477598c..a72c8f574484 100644 --- a/datafusion/sqllogictest/test_files/string/string_view.slt +++ b/datafusion/sqllogictest/test_files/string/string_view.slt @@ -1068,7 +1068,7 @@ EXPLAIN SELECT FROM test; ---- logical_plan -01)Projection: digest(test.column1_utf8view, Utf8View("md5")) AS c +01)Projection: digest(test.column1_utf8view, Utf8("md5")) AS c 02)--TableScan: test projection=[column1_utf8view] ## Ensure no unexpected casts for string_to_array From 19d1f05eb50d7b18e6274af85d18efa86d2a1a28 Mon Sep 17 00:00:00 2001 From: Cheng-Yuan-Lai Date: Wed, 26 Feb 2025 23:38:25 +0800 Subject: [PATCH 05/15] fix signatrue for sha256 --- datafusion/functions/src/crypto/sha256.rs | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/datafusion/functions/src/crypto/sha256.rs b/datafusion/functions/src/crypto/sha256.rs index f275295731b1..b6a7db72e261 100644 --- a/datafusion/functions/src/crypto/sha256.rs +++ b/datafusion/functions/src/crypto/sha256.rs @@ -19,7 +19,7 @@ use super::basic::{sha256, utf8_or_binary_to_binary_type}; use arrow::datatypes::DataType; use datafusion_common::{ - types::{logical_binary, logical_string}, + types::{logical_binary, logical_string, NativeType}, Result, }; use datafusion_expr::{ @@ -57,15 +57,15 @@ impl Default for SHA256Func { impl SHA256Func { pub fn new() -> Self { Self { - signature: Signature::one_of( - vec![ - TypeSignature::Coercible(vec![Coercion::new_exact( + signature: Signature::new( + TypeSignature::Coercible(vec![Coercion::new_implicit( + TypeSignatureClass::Native(logical_binary()), + vec![ TypeSignatureClass::Native(logical_string()), - )]), - TypeSignature::Coercible(vec![Coercion::new_exact( TypeSignatureClass::Native(logical_binary()), - )]), - ], + ], + NativeType::Binary, + )]), Volatility::Immutable, ), } From 771620ed673444ab737fef58d7217a55d74a5044 Mon Sep 17 00:00:00 2001 From: Cheng-Yuan-Lai Date: Wed, 26 Feb 2025 23:50:47 +0800 Subject: [PATCH 06/15] remove unsed import --- datafusion/functions/src/crypto/digest.rs | 2 +- datafusion/functions/src/crypto/sha256.rs | 7 ++----- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/datafusion/functions/src/crypto/digest.rs b/datafusion/functions/src/crypto/digest.rs index c7a21ffcee5d..2840006169be 100644 --- a/datafusion/functions/src/crypto/digest.rs +++ b/datafusion/functions/src/crypto/digest.rs @@ -19,7 +19,7 @@ use super::basic::{digest, utf8_or_binary_to_binary_type}; use arrow::datatypes::DataType; use datafusion_common::{ - types::{logical_binary, logical_string, NativeType}, + types::{logical_binary, logical_string}, Result, }; use datafusion_expr::{ diff --git a/datafusion/functions/src/crypto/sha256.rs b/datafusion/functions/src/crypto/sha256.rs index b6a7db72e261..cd63b846fcc6 100644 --- a/datafusion/functions/src/crypto/sha256.rs +++ b/datafusion/functions/src/crypto/sha256.rs @@ -59,11 +59,8 @@ impl SHA256Func { Self { signature: Signature::new( TypeSignature::Coercible(vec![Coercion::new_implicit( - TypeSignatureClass::Native(logical_binary()), - vec![ - TypeSignatureClass::Native(logical_string()), - TypeSignatureClass::Native(logical_binary()), - ], + TypeSignatureClass::Native(logical_string()), + vec![TypeSignatureClass::Native(logical_binary())], NativeType::Binary, )]), Volatility::Immutable, From 0d28d54d0b88f9e8f1368759047b8616e1ab476a Mon Sep 17 00:00:00 2001 From: Cheng-Yuan-Lai Date: Thu, 6 Mar 2025 23:47:53 +0800 Subject: [PATCH 07/15] support binaryView --- datafusion/functions/src/crypto/basic.rs | 123 ++++++++++++++++------ datafusion/functions/src/crypto/sha256.rs | 12 +-- 2 files changed, 95 insertions(+), 40 deletions(-) diff --git a/datafusion/functions/src/crypto/basic.rs b/datafusion/functions/src/crypto/basic.rs index 191154b8f8ff..3375addbd7cd 100644 --- a/datafusion/functions/src/crypto/basic.rs +++ b/datafusion/functions/src/crypto/basic.rs @@ -17,7 +17,10 @@ //! "crypto" DataFusion functions -use arrow::array::{Array, ArrayRef, BinaryArray, OffsetSizeTrait}; +use arrow::array::{ + Array, ArrayRef, BinaryArray, BinaryArrayType, BinaryViewArray, GenericBinaryArray, + OffsetSizeTrait, +}; use arrow::array::{AsArray, GenericStringArray, StringArray, StringViewArray}; use arrow::datatypes::DataType; use blake2::{Blake2b512, Blake2s256, Digest}; @@ -198,11 +201,13 @@ pub fn utf8_or_binary_to_binary_type( arg_type: &DataType, name: &str, ) -> Result { + dbg!(arg_type); Ok(match arg_type { DataType::Utf8View | DataType::LargeUtf8 | DataType::Utf8 | DataType::Binary + | DataType::BinaryView | DataType::LargeBinary => DataType::Binary, DataType::Null => DataType::Null, _ => { @@ -251,30 +256,44 @@ impl DigestAlgorithm { where T: OffsetSizeTrait, { - let input_value = as_generic_binary_array::(value)?; - let array: ArrayRef = match self { - Self::Md5 => digest_to_array!(Md5, input_value), - Self::Sha224 => digest_to_array!(Sha224, input_value), - Self::Sha256 => digest_to_array!(Sha256, input_value), - Self::Sha384 => digest_to_array!(Sha384, input_value), - Self::Sha512 => digest_to_array!(Sha512, input_value), - Self::Blake2b => digest_to_array!(Blake2b512, input_value), - Self::Blake2s => digest_to_array!(Blake2s256, input_value), - Self::Blake3 => { - let binary_array: BinaryArray = input_value - .iter() - .map(|opt| { - opt.map(|x| { - let mut digest = Blake3::default(); - digest.update(x); - Blake3::finalize(&digest).as_bytes().to_vec() - }) - }) - .collect(); - Arc::new(binary_array) + let array = match value.data_type() { + DataType::Binary | DataType::LargeBinary => { + let v = value.as_binary::(); + self.digest_binary_array_impl::<&GenericBinaryArray>(v) + } + DataType::BinaryView => { + let v = value.as_binary_view(); + self.digest_binary_array_impl::<&BinaryViewArray>(v) + } + other => { + return exec_err!("unsupported type for digest_utf_array: {other:?}") } }; Ok(ColumnarValue::Array(array)) + // let input_value = as_generic_binary_array::(value)?; + // let array: ArrayRef = match self { + // Self::Md5 => digest_to_array!(Md5, input_value), + // Self::Sha224 => digest_to_array!(Sha224, input_value), + // Self::Sha256 => digest_to_array!(Sha256, input_value), + // Self::Sha384 => digest_to_array!(Sha384, input_value), + // Self::Sha512 => digest_to_array!(Sha512, input_value), + // Self::Blake2b => digest_to_array!(Blake2b512, input_value), + // Self::Blake2s => digest_to_array!(Blake2s256, input_value), + // Self::Blake3 => { + // let binary_array: BinaryArray = input_value + // .iter() + // .map(|opt| { + // opt.map(|x| { + // let mut digest = Blake3::default(); + // digest.update(x); + // Blake3::finalize(&digest).as_bytes().to_vec() + // }) + // }) + // .collect(); + // Arc::new(binary_array) + // } + // }; + // Ok(ColumnarValue::Array(array)) } /// digest a string array to their hash values @@ -328,6 +347,37 @@ impl DigestAlgorithm { } } } + + pub fn digest_binary_array_impl<'a, BinaryArrType>( + self, + input_value: BinaryArrType, + ) -> ArrayRef + where + BinaryArrType: BinaryArrayType<'a>, + { + match self { + Self::Md5 => digest_to_array!(Md5, input_value), + Self::Sha224 => digest_to_array!(Sha224, input_value), + Self::Sha256 => digest_to_array!(Sha256, input_value), + Self::Sha384 => digest_to_array!(Sha384, input_value), + Self::Sha512 => digest_to_array!(Sha512, input_value), + Self::Blake2b => digest_to_array!(Blake2b512, input_value), + Self::Blake2s => digest_to_array!(Blake2s256, input_value), + Self::Blake3 => { + let binary_array: BinaryArray = input_value + .iter() + .map(|opt| { + opt.map(|x| { + let mut digest = Blake3::default(); + digest.update(x); + Blake3::finalize(&digest).as_bytes().to_vec() + }) + }) + .collect(); + Arc::new(binary_array) + } + } + } } pub fn digest_process( value: &ColumnarValue, @@ -342,22 +392,27 @@ pub fn digest_process( DataType::LargeBinary => { digest_algorithm.digest_binary_array::(a.as_ref()) } - other => exec_err!( - "Unsupported data type {other:?} for function {digest_algorithm}" - ), - }, - ColumnarValue::Scalar(scalar) => match scalar { - ScalarValue::Utf8View(a) - | ScalarValue::Utf8(a) - | ScalarValue::LargeUtf8(a) => { - Ok(digest_algorithm - .digest_scalar(a.as_ref().map(|s: &String| s.as_bytes()))) + DataType::BinaryView => { + digest_algorithm.digest_binary_array::(a.as_ref()) } - ScalarValue::Binary(a) | ScalarValue::LargeBinary(a) => Ok(digest_algorithm - .digest_scalar(a.as_ref().map(|v: &Vec| v.as_slice()))), other => exec_err!( "Unsupported data type {other:?} for function {digest_algorithm}" ), }, + ColumnarValue::Scalar(scalar) => { + match scalar { + ScalarValue::Utf8View(a) + | ScalarValue::Utf8(a) + | ScalarValue::LargeUtf8(a) => Ok(digest_algorithm + .digest_scalar(a.as_ref().map(|s: &String| s.as_bytes()))), + ScalarValue::Binary(a) + | ScalarValue::LargeBinary(a) + | ScalarValue::BinaryView(a) => Ok(digest_algorithm + .digest_scalar(a.as_ref().map(|v: &Vec| v.as_slice()))), + other => exec_err!( + "Unsupported data type {other:?} for function {digest_algorithm}" + ), + } + } } } diff --git a/datafusion/functions/src/crypto/sha256.rs b/datafusion/functions/src/crypto/sha256.rs index cd63b846fcc6..7a25ac96e639 100644 --- a/datafusion/functions/src/crypto/sha256.rs +++ b/datafusion/functions/src/crypto/sha256.rs @@ -24,7 +24,7 @@ use datafusion_common::{ }; use datafusion_expr::{ ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature, - TypeSignature, Volatility, + Volatility, }; use datafusion_expr_common::signature::{Coercion, TypeSignatureClass}; use datafusion_macros::user_doc; @@ -57,12 +57,12 @@ impl Default for SHA256Func { impl SHA256Func { pub fn new() -> Self { Self { - signature: Signature::new( - TypeSignature::Coercible(vec![Coercion::new_implicit( - TypeSignatureClass::Native(logical_string()), - vec![TypeSignatureClass::Native(logical_binary())], + signature: Signature::coercible( + vec![Coercion::new_implicit( + TypeSignatureClass::Native(logical_binary()), + vec![TypeSignatureClass::Native(logical_string())], NativeType::Binary, - )]), + )], Volatility::Immutable, ), } From cdbafcfc183c23f338b99ca4fed06d02861ddf4f Mon Sep 17 00:00:00 2001 From: Cheng-Yuan-Lai Date: Fri, 7 Mar 2025 00:59:58 +0800 Subject: [PATCH 08/15] modify signature if sha256 function --- datafusion/functions/src/crypto/sha256.rs | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/datafusion/functions/src/crypto/sha256.rs b/datafusion/functions/src/crypto/sha256.rs index 7a25ac96e639..f275295731b1 100644 --- a/datafusion/functions/src/crypto/sha256.rs +++ b/datafusion/functions/src/crypto/sha256.rs @@ -19,12 +19,12 @@ use super::basic::{sha256, utf8_or_binary_to_binary_type}; use arrow::datatypes::DataType; use datafusion_common::{ - types::{logical_binary, logical_string, NativeType}, + types::{logical_binary, logical_string}, Result, }; use datafusion_expr::{ ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature, - Volatility, + TypeSignature, Volatility, }; use datafusion_expr_common::signature::{Coercion, TypeSignatureClass}; use datafusion_macros::user_doc; @@ -57,12 +57,15 @@ impl Default for SHA256Func { impl SHA256Func { pub fn new() -> Self { Self { - signature: Signature::coercible( - vec![Coercion::new_implicit( - TypeSignatureClass::Native(logical_binary()), - vec![TypeSignatureClass::Native(logical_string())], - NativeType::Binary, - )], + signature: Signature::one_of( + vec![ + TypeSignature::Coercible(vec![Coercion::new_exact( + TypeSignatureClass::Native(logical_string()), + )]), + TypeSignature::Coercible(vec![Coercion::new_exact( + TypeSignatureClass::Native(logical_binary()), + )]), + ], Volatility::Immutable, ), } From 093bfbf46d31f8586e15662130e95c99d1f4a203 Mon Sep 17 00:00:00 2001 From: Cheng-Yuan-Lai Date: Fri, 7 Mar 2025 01:12:53 +0800 Subject: [PATCH 09/15] remove unsed import --- datafusion/functions/src/crypto/basic.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion/functions/src/crypto/basic.rs b/datafusion/functions/src/crypto/basic.rs index 3375addbd7cd..e71b326f390f 100644 --- a/datafusion/functions/src/crypto/basic.rs +++ b/datafusion/functions/src/crypto/basic.rs @@ -29,8 +29,8 @@ use datafusion_common::cast::as_binary_array; use arrow::compute::StringArrayType; use datafusion_common::{ - cast::as_generic_binary_array, exec_err, internal_err, plan_err, - utils::take_function_args, DataFusionError, Result, ScalarValue, + exec_err, internal_err, plan_err, utils::take_function_args, DataFusionError, Result, + ScalarValue, }; use datafusion_expr::ColumnarValue; use md5::Md5; From fdfabef36ff5a515c7c0ba723285fab74318d1ee Mon Sep 17 00:00:00 2001 From: Cheng-Yuan-Lai Date: Sat, 8 Mar 2025 00:06:49 +0800 Subject: [PATCH 10/15] clean unused codes --- datafusion/functions/src/crypto/basic.rs | 25 ------------------------ 1 file changed, 25 deletions(-) diff --git a/datafusion/functions/src/crypto/basic.rs b/datafusion/functions/src/crypto/basic.rs index e71b326f390f..eaa688c1c335 100644 --- a/datafusion/functions/src/crypto/basic.rs +++ b/datafusion/functions/src/crypto/basic.rs @@ -201,7 +201,6 @@ pub fn utf8_or_binary_to_binary_type( arg_type: &DataType, name: &str, ) -> Result { - dbg!(arg_type); Ok(match arg_type { DataType::Utf8View | DataType::LargeUtf8 @@ -270,30 +269,6 @@ impl DigestAlgorithm { } }; Ok(ColumnarValue::Array(array)) - // let input_value = as_generic_binary_array::(value)?; - // let array: ArrayRef = match self { - // Self::Md5 => digest_to_array!(Md5, input_value), - // Self::Sha224 => digest_to_array!(Sha224, input_value), - // Self::Sha256 => digest_to_array!(Sha256, input_value), - // Self::Sha384 => digest_to_array!(Sha384, input_value), - // Self::Sha512 => digest_to_array!(Sha512, input_value), - // Self::Blake2b => digest_to_array!(Blake2b512, input_value), - // Self::Blake2s => digest_to_array!(Blake2s256, input_value), - // Self::Blake3 => { - // let binary_array: BinaryArray = input_value - // .iter() - // .map(|opt| { - // opt.map(|x| { - // let mut digest = Blake3::default(); - // digest.update(x); - // Blake3::finalize(&digest).as_bytes().to_vec() - // }) - // }) - // .collect(); - // Arc::new(binary_array) - // } - // }; - // Ok(ColumnarValue::Array(array)) } /// digest a string array to their hash values From cd63fc66073fbc8907129575ec5c06182a737992 Mon Sep 17 00:00:00 2001 From: Ian Lai Date: Tue, 25 Mar 2025 09:33:34 +0000 Subject: [PATCH 11/15] rewrite function using new_implicit for sha512 --- datafusion/functions/src/crypto/sha512.rs | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/datafusion/functions/src/crypto/sha512.rs b/datafusion/functions/src/crypto/sha512.rs index bdb4fc74f1e2..93759966890a 100644 --- a/datafusion/functions/src/crypto/sha512.rs +++ b/datafusion/functions/src/crypto/sha512.rs @@ -19,7 +19,7 @@ use super::basic::{sha512, utf8_or_binary_to_binary_type}; use arrow::datatypes::DataType; use datafusion_common::{ - types::{logical_binary, logical_string}, + types::{logical_binary, logical_float64, logical_string, NativeType}, Result, }; use datafusion_expr::{ @@ -59,11 +59,15 @@ impl SHA512Func { Self { signature: Signature::one_of( vec![ - TypeSignature::Coercible(vec![Coercion::new_exact( - TypeSignatureClass::Native(logical_string()), + TypeSignature::Coercible(vec![Coercion::new_implicit( + TypeSignatureClass::Native(logical_binary()), + vec![TypeSignatureClass::Native(logical_string())], + NativeType::String, )]), - TypeSignature::Coercible(vec![Coercion::new_exact( + TypeSignature::Coercible(vec![Coercion::new_implicit( TypeSignatureClass::Native(logical_binary()), + vec![TypeSignatureClass::Native(logical_binary())], + NativeType::Binary, )]), ], Volatility::Immutable, From 4dfb4f64a95068217aac3841f33d55d2fb4c3180 Mon Sep 17 00:00:00 2001 From: Ian Lai Date: Tue, 25 Mar 2025 09:46:38 +0000 Subject: [PATCH 12/15] remove unused import --- datafusion/functions/src/crypto/sha512.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/functions/src/crypto/sha512.rs b/datafusion/functions/src/crypto/sha512.rs index 93759966890a..f48737e5751f 100644 --- a/datafusion/functions/src/crypto/sha512.rs +++ b/datafusion/functions/src/crypto/sha512.rs @@ -19,7 +19,7 @@ use super::basic::{sha512, utf8_or_binary_to_binary_type}; use arrow::datatypes::DataType; use datafusion_common::{ - types::{logical_binary, logical_float64, logical_string, NativeType}, + types::{logical_binary, logical_string, NativeType}, Result, }; use datafusion_expr::{ From c35d30cc184482f1f6cfa1ef646329524a4fd7ae Mon Sep 17 00:00:00 2001 From: Ian Lai Date: Wed, 26 Mar 2025 03:32:59 +0000 Subject: [PATCH 13/15] rewrite signature with new_implicit for other crypto functions --- datafusion/functions/src/crypto/md5.rs | 12 ++++++++---- datafusion/functions/src/crypto/sha224.rs | 12 ++++++++---- datafusion/functions/src/crypto/sha256.rs | 12 ++++++++---- datafusion/functions/src/crypto/sha384.rs | 12 ++++++++---- 4 files changed, 32 insertions(+), 16 deletions(-) diff --git a/datafusion/functions/src/crypto/md5.rs b/datafusion/functions/src/crypto/md5.rs index 68f8142bce1c..4b2c67b98379 100644 --- a/datafusion/functions/src/crypto/md5.rs +++ b/datafusion/functions/src/crypto/md5.rs @@ -20,7 +20,7 @@ use crate::crypto::basic::md5; use arrow::datatypes::DataType; use datafusion_common::{ plan_err, - types::{logical_binary, logical_string}, + types::{logical_binary, logical_string, NativeType}, Result, }; use datafusion_expr::{ @@ -60,11 +60,15 @@ impl Md5Func { Self { signature: Signature::one_of( vec![ - TypeSignature::Coercible(vec![Coercion::new_exact( - TypeSignatureClass::Native(logical_string()), + TypeSignature::Coercible(vec![Coercion::new_implicit( + TypeSignatureClass::Native(logical_binary()), + vec![TypeSignatureClass::Native(logical_string())], + NativeType::String, )]), - TypeSignature::Coercible(vec![Coercion::new_exact( + TypeSignature::Coercible(vec![Coercion::new_implicit( TypeSignatureClass::Native(logical_binary()), + vec![TypeSignatureClass::Native(logical_binary())], + NativeType::Binary, )]), ], Volatility::Immutable, diff --git a/datafusion/functions/src/crypto/sha224.rs b/datafusion/functions/src/crypto/sha224.rs index b155e4efcbb4..a64a3ef80319 100644 --- a/datafusion/functions/src/crypto/sha224.rs +++ b/datafusion/functions/src/crypto/sha224.rs @@ -19,7 +19,7 @@ use super::basic::{sha224, utf8_or_binary_to_binary_type}; use arrow::datatypes::DataType; use datafusion_common::{ - types::{logical_binary, logical_string}, + types::{logical_binary, logical_string, NativeType}, Result, }; use datafusion_expr::{ @@ -60,11 +60,15 @@ impl SHA224Func { Self { signature: Signature::one_of( vec![ - TypeSignature::Coercible(vec![Coercion::new_exact( - TypeSignatureClass::Native(logical_string()), + TypeSignature::Coercible(vec![Coercion::new_implicit( + TypeSignatureClass::Native(logical_binary()), + vec![TypeSignatureClass::Native(logical_string())], + NativeType::String, )]), - TypeSignature::Coercible(vec![Coercion::new_exact( + TypeSignature::Coercible(vec![Coercion::new_implicit( TypeSignatureClass::Native(logical_binary()), + vec![TypeSignatureClass::Native(logical_binary())], + NativeType::Binary, )]), ], Volatility::Immutable, diff --git a/datafusion/functions/src/crypto/sha256.rs b/datafusion/functions/src/crypto/sha256.rs index f275295731b1..94f3ea3b49fa 100644 --- a/datafusion/functions/src/crypto/sha256.rs +++ b/datafusion/functions/src/crypto/sha256.rs @@ -19,7 +19,7 @@ use super::basic::{sha256, utf8_or_binary_to_binary_type}; use arrow::datatypes::DataType; use datafusion_common::{ - types::{logical_binary, logical_string}, + types::{logical_binary, logical_string, NativeType}, Result, }; use datafusion_expr::{ @@ -59,11 +59,15 @@ impl SHA256Func { Self { signature: Signature::one_of( vec![ - TypeSignature::Coercible(vec![Coercion::new_exact( - TypeSignatureClass::Native(logical_string()), + TypeSignature::Coercible(vec![Coercion::new_implicit( + TypeSignatureClass::Native(logical_binary()), + vec![TypeSignatureClass::Native(logical_string())], + NativeType::String, )]), - TypeSignature::Coercible(vec![Coercion::new_exact( + TypeSignature::Coercible(vec![Coercion::new_implicit( TypeSignatureClass::Native(logical_binary()), + vec![TypeSignatureClass::Native(logical_binary())], + NativeType::Binary, )]), ], Volatility::Immutable, diff --git a/datafusion/functions/src/crypto/sha384.rs b/datafusion/functions/src/crypto/sha384.rs index 4f6f8e1774fd..023730469c7b 100644 --- a/datafusion/functions/src/crypto/sha384.rs +++ b/datafusion/functions/src/crypto/sha384.rs @@ -19,7 +19,7 @@ use super::basic::{sha384, utf8_or_binary_to_binary_type}; use arrow::datatypes::DataType; use datafusion_common::{ - types::{logical_binary, logical_string}, + types::{logical_binary, logical_string, NativeType}, Result, }; use datafusion_expr::{ @@ -59,11 +59,15 @@ impl SHA384Func { Self { signature: Signature::one_of( vec![ - TypeSignature::Coercible(vec![Coercion::new_exact( - TypeSignatureClass::Native(logical_string()), + TypeSignature::Coercible(vec![Coercion::new_implicit( + TypeSignatureClass::Native(logical_binary()), + vec![TypeSignatureClass::Native(logical_string())], + NativeType::String, )]), - TypeSignature::Coercible(vec![Coercion::new_exact( + TypeSignature::Coercible(vec![Coercion::new_implicit( TypeSignatureClass::Native(logical_binary()), + vec![TypeSignatureClass::Native(logical_binary())], + NativeType::Binary, )]), ], Volatility::Immutable, From eeaee64bf3407f97396630a18363ecd94aaae357 Mon Sep 17 00:00:00 2001 From: Ian Lai Date: Wed, 26 Mar 2025 06:20:37 +0000 Subject: [PATCH 14/15] support null for md5 function --- datafusion/functions/src/crypto/md5.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion/functions/src/crypto/md5.rs b/datafusion/functions/src/crypto/md5.rs index 4b2c67b98379..c1540450029c 100644 --- a/datafusion/functions/src/crypto/md5.rs +++ b/datafusion/functions/src/crypto/md5.rs @@ -93,11 +93,11 @@ impl ScalarUDFImpl for Md5Func { use DataType::*; Ok(match &arg_types[0] { LargeUtf8 | LargeBinary => Utf8, - Utf8View | Utf8 | Binary => Utf8, + Utf8View | Utf8 | Binary | BinaryView => Utf8, Null => Null, Dictionary(_, t) => match **t { LargeUtf8 | LargeBinary => Utf8, - Utf8 | Binary => Utf8, + Utf8 | Binary | BinaryView => Utf8, Null => Null, _ => { return plan_err!( From c2e0ae813903e9a083cc6e955bac359e661cccf9 Mon Sep 17 00:00:00 2001 From: Ian Lai Date: Wed, 26 Mar 2025 08:51:06 +0000 Subject: [PATCH 15/15] modify sqllogictest to fit allowed input type for md5 function --- datafusion/sqllogictest/test_files/array.slt | 22 ++++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/datafusion/sqllogictest/test_files/array.slt b/datafusion/sqllogictest/test_files/array.slt index 352064fbe5c8..cb56686b6437 100644 --- a/datafusion/sqllogictest/test_files/array.slt +++ b/datafusion/sqllogictest/test_files/array.slt @@ -5973,13 +5973,13 @@ true false true false false false true true false false true false true # NB that `col in (a, b, c)` is simplified to OR if there are <= 3 elements, so we make 4-element haystack lists query I -with test AS (SELECT substr(md5(i)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i)) +with test AS (SELECT substr(md5(i::text)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i)) select count(*) from test WHERE needle IN ('7f4b18de3cfeb9b4ac78c381ee2ad278', 'a', 'b', 'c'); ---- 1 query TT -explain with test AS (SELECT substr(md5(i)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i)) +explain with test AS (SELECT substr(md5(i::text)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i)) select count(*) from test WHERE needle IN ('7f4b18de3cfeb9b4ac78c381ee2ad278', 'a', 'b', 'c'); ---- logical_plan @@ -6002,13 +6002,13 @@ physical_plan 09)----------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192] query I -with test AS (SELECT substr(md5(i)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i)) +with test AS (SELECT substr(md5(i::text)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i)) select count(*) from test WHERE needle = ANY(['7f4b18de3cfeb9b4ac78c381ee2ad278', 'a', 'b', 'c']); ---- 1 query TT -explain with test AS (SELECT substr(md5(i)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i)) +explain with test AS (SELECT substr(md5(i::text)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i)) select count(*) from test WHERE needle = ANY(['7f4b18de3cfeb9b4ac78c381ee2ad278', 'a', 'b', 'c']); ---- logical_plan @@ -6031,13 +6031,13 @@ physical_plan 09)----------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192] query I -with test AS (SELECT substr(md5(i)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i)) +with test AS (SELECT substr(md5(i::text)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i)) select count(*) from test WHERE array_has(['7f4b18de3cfeb9b4ac78c381ee2ad278', 'a', 'b', 'c'], needle); ---- 1 query TT -explain with test AS (SELECT substr(md5(i)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i)) +explain with test AS (SELECT substr(md5(i::text)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i)) select count(*) from test WHERE array_has(['7f4b18de3cfeb9b4ac78c381ee2ad278', 'a', 'b', 'c'], needle); ---- logical_plan @@ -6068,7 +6068,7 @@ physical_plan # FIXME: array_has with large list haystack not currently rewritten to InList query TT -explain with test AS (SELECT substr(md5(i)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i)) +explain with test AS (SELECT substr(md5(i::text)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i)) select count(*) from test WHERE array_has(arrow_cast(['7f4b18de3cfeb9b4ac78c381ee2ad278', 'a', 'b', 'c'], 'LargeList(Utf8View)'), needle); ---- logical_plan @@ -6091,13 +6091,13 @@ physical_plan 09)----------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192] query I -with test AS (SELECT substr(md5(i)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i)) +with test AS (SELECT substr(md5(i::text)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i)) select count(*) from test WHERE array_has(arrow_cast(['7f4b18de3cfeb9b4ac78c381ee2ad278', 'a', 'b', 'c'], 'FixedSizeList(4, Utf8View)'), needle); ---- 1 query TT -explain with test AS (SELECT substr(md5(i)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i)) +explain with test AS (SELECT substr(md5(i::text)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i)) select count(*) from test WHERE array_has(arrow_cast(['7f4b18de3cfeb9b4ac78c381ee2ad278', 'a', 'b', 'c'], 'FixedSizeList(4, Utf8View)'), needle); ---- logical_plan @@ -6120,14 +6120,14 @@ physical_plan 09)----------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192] query I -with test AS (SELECT substr(md5(i)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i)) +with test AS (SELECT substr(md5(i::text)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i)) select count(*) from test WHERE array_has([needle], needle); ---- 100000 # TODO: this should probably be possible to completely remove the filter as always true? query TT -explain with test AS (SELECT substr(md5(i)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i)) +explain with test AS (SELECT substr(md5(i::text)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i)) select count(*) from test WHERE array_has([needle], needle); ---- logical_plan