Skip to content

refactor: use TypeSignature::Coercible for crypto functions #14826

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 15 commits into from
Mar 27, 2025
102 changes: 66 additions & 36 deletions datafusion/functions/src/crypto/basic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,10 @@

//! "crypto" DataFusion functions

use arrow::array::{Array, ArrayRef, BinaryArray, OffsetSizeTrait};
use arrow::array::{
Array, ArrayRef, BinaryArray, BinaryArrayType, BinaryViewArray, GenericBinaryArray,
OffsetSizeTrait,
};
use arrow::array::{AsArray, GenericStringArray, StringArray, StringViewArray};
use arrow::datatypes::DataType;
use blake2::{Blake2b512, Blake2s256, Digest};
Expand All @@ -26,8 +29,8 @@ use datafusion_common::cast::as_binary_array;

use arrow::compute::StringArrayType;
use datafusion_common::{
cast::as_generic_binary_array, exec_err, internal_err, plan_err,
utils::take_function_args, DataFusionError, Result, ScalarValue,
exec_err, internal_err, plan_err, utils::take_function_args, DataFusionError, Result,
ScalarValue,
};
use datafusion_expr::ColumnarValue;
use md5::Md5;
Expand Down Expand Up @@ -203,6 +206,7 @@ pub fn utf8_or_binary_to_binary_type(
| DataType::LargeUtf8
| DataType::Utf8
| DataType::Binary
| DataType::BinaryView
| DataType::LargeBinary => DataType::Binary,
DataType::Null => DataType::Null,
_ => {
Expand Down Expand Up @@ -251,27 +255,17 @@ impl DigestAlgorithm {
where
T: OffsetSizeTrait,
{
let input_value = as_generic_binary_array::<T>(value)?;
let array: ArrayRef = match self {
Self::Md5 => digest_to_array!(Md5, input_value),
Self::Sha224 => digest_to_array!(Sha224, input_value),
Self::Sha256 => digest_to_array!(Sha256, input_value),
Self::Sha384 => digest_to_array!(Sha384, input_value),
Self::Sha512 => digest_to_array!(Sha512, input_value),
Self::Blake2b => digest_to_array!(Blake2b512, input_value),
Self::Blake2s => digest_to_array!(Blake2s256, input_value),
Self::Blake3 => {
let binary_array: BinaryArray = input_value
.iter()
.map(|opt| {
opt.map(|x| {
let mut digest = Blake3::default();
digest.update(x);
Blake3::finalize(&digest).as_bytes().to_vec()
})
})
.collect();
Arc::new(binary_array)
let array = match value.data_type() {
DataType::Binary | DataType::LargeBinary => {
let v = value.as_binary::<T>();
self.digest_binary_array_impl::<&GenericBinaryArray<T>>(v)
}
DataType::BinaryView => {
let v = value.as_binary_view();
self.digest_binary_array_impl::<&BinaryViewArray>(v)
}
other => {
return exec_err!("unsupported type for digest_utf_array: {other:?}")
}
};
Ok(ColumnarValue::Array(array))
Expand Down Expand Up @@ -328,6 +322,37 @@ impl DigestAlgorithm {
}
}
}

pub fn digest_binary_array_impl<'a, BinaryArrType>(
self,
input_value: BinaryArrType,
) -> ArrayRef
where
BinaryArrType: BinaryArrayType<'a>,
{
match self {
Self::Md5 => digest_to_array!(Md5, input_value),
Self::Sha224 => digest_to_array!(Sha224, input_value),
Self::Sha256 => digest_to_array!(Sha256, input_value),
Self::Sha384 => digest_to_array!(Sha384, input_value),
Self::Sha512 => digest_to_array!(Sha512, input_value),
Self::Blake2b => digest_to_array!(Blake2b512, input_value),
Self::Blake2s => digest_to_array!(Blake2s256, input_value),
Self::Blake3 => {
let binary_array: BinaryArray = input_value
.iter()
.map(|opt| {
opt.map(|x| {
let mut digest = Blake3::default();
digest.update(x);
Blake3::finalize(&digest).as_bytes().to_vec()
})
})
.collect();
Arc::new(binary_array)
}
}
}
Comment on lines +326 to +355
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I found that digest_binary_array_impl and digest_utf8_array_impl are similar, should we integrate them into one?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be nice to

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I found that these two array types don't share a common trait that would allow implementing a single generic function. Since the key difference is in how bytes are accessed, I'll keep these functions separate to maintain type safety.

}
pub fn digest_process(
value: &ColumnarValue,
Expand All @@ -342,22 +367,27 @@ pub fn digest_process(
DataType::LargeBinary => {
digest_algorithm.digest_binary_array::<i64>(a.as_ref())
}
other => exec_err!(
"Unsupported data type {other:?} for function {digest_algorithm}"
),
},
ColumnarValue::Scalar(scalar) => match scalar {
ScalarValue::Utf8View(a)
| ScalarValue::Utf8(a)
| ScalarValue::LargeUtf8(a) => {
Ok(digest_algorithm
.digest_scalar(a.as_ref().map(|s: &String| s.as_bytes())))
DataType::BinaryView => {
digest_algorithm.digest_binary_array::<i32>(a.as_ref())
}
ScalarValue::Binary(a) | ScalarValue::LargeBinary(a) => Ok(digest_algorithm
.digest_scalar(a.as_ref().map(|v: &Vec<u8>| v.as_slice()))),
other => exec_err!(
"Unsupported data type {other:?} for function {digest_algorithm}"
),
},
ColumnarValue::Scalar(scalar) => {
match scalar {
ScalarValue::Utf8View(a)
| ScalarValue::Utf8(a)
| ScalarValue::LargeUtf8(a) => Ok(digest_algorithm
.digest_scalar(a.as_ref().map(|s: &String| s.as_bytes()))),
ScalarValue::Binary(a)
| ScalarValue::LargeBinary(a)
| ScalarValue::BinaryView(a) => Ok(digest_algorithm
.digest_scalar(a.as_ref().map(|v: &Vec<u8>| v.as_slice()))),
other => exec_err!(
"Unsupported data type {other:?} for function {digest_algorithm}"
),
}
}
}
}
22 changes: 14 additions & 8 deletions datafusion/functions/src/crypto/digest.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,15 @@
//! "crypto" DataFusion functions
use super::basic::{digest, utf8_or_binary_to_binary_type};
use arrow::datatypes::DataType;
use datafusion_common::Result;
use datafusion_common::{
types::{logical_binary, logical_string},
Result,
};
use datafusion_expr::{
ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
TypeSignature::*, Volatility,
TypeSignature, Volatility,
};
use datafusion_expr_common::signature::{Coercion, TypeSignatureClass};
use datafusion_macros::user_doc;
use std::any::Any;

Expand Down Expand Up @@ -64,15 +68,17 @@ impl Default for DigestFunc {

impl DigestFunc {
pub fn new() -> Self {
use DataType::*;
Self {
signature: Signature::one_of(
vec![
Exact(vec![Utf8View, Utf8View]),
Exact(vec![Utf8, Utf8]),
Exact(vec![LargeUtf8, Utf8]),
Exact(vec![Binary, Utf8]),
Exact(vec![LargeBinary, Utf8]),
TypeSignature::Coercible(vec![
Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
]),
TypeSignature::Coercible(vec![
Coercion::new_exact(TypeSignatureClass::Native(logical_binary())),
Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
]),
],
Volatility::Immutable,
),
Expand Down
30 changes: 22 additions & 8 deletions datafusion/functions/src/crypto/md5.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,16 @@
//! "crypto" DataFusion functions
use crate::crypto::basic::md5;
use arrow::datatypes::DataType;
use datafusion_common::{plan_err, Result};
use datafusion_common::{
plan_err,
types::{logical_binary, logical_string, NativeType},
Result,
};
use datafusion_expr::{
ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
Volatility,
TypeSignature, Volatility,
};
use datafusion_expr_common::signature::{Coercion, TypeSignatureClass};
use datafusion_macros::user_doc;
use std::any::Any;

Expand Down Expand Up @@ -52,11 +57,20 @@ impl Default for Md5Func {

impl Md5Func {
pub fn new() -> Self {
use DataType::*;
Self {
signature: Signature::uniform(
1,
vec![Utf8View, Utf8, LargeUtf8, Binary, LargeBinary],
signature: Signature::one_of(
vec![
TypeSignature::Coercible(vec![Coercion::new_implicit(
TypeSignatureClass::Native(logical_binary()),
vec![TypeSignatureClass::Native(logical_string())],
NativeType::String,
)]),
TypeSignature::Coercible(vec![Coercion::new_implicit(
TypeSignatureClass::Native(logical_binary()),
vec![TypeSignatureClass::Native(logical_binary())],
NativeType::Binary,
)]),
],
Volatility::Immutable,
),
}
Expand All @@ -79,11 +93,11 @@ impl ScalarUDFImpl for Md5Func {
use DataType::*;
Ok(match &arg_types[0] {
LargeUtf8 | LargeBinary => Utf8,
Utf8View | Utf8 | Binary => Utf8,
Utf8View | Utf8 | Binary | BinaryView => Utf8,
Null => Null,
Dictionary(_, t) => match **t {
LargeUtf8 | LargeBinary => Utf8,
Utf8 | Binary => Utf8,
Utf8 | Binary | BinaryView => Utf8,
Null => Null,
_ => {
return plan_err!(
Expand Down
25 changes: 19 additions & 6 deletions datafusion/functions/src/crypto/sha224.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,15 @@
//! "crypto" DataFusion functions
use super::basic::{sha224, utf8_or_binary_to_binary_type};
use arrow::datatypes::DataType;
use datafusion_common::Result;
use datafusion_common::{
types::{logical_binary, logical_string, NativeType},
Result,
};
use datafusion_expr::{
ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
Volatility,
TypeSignature, Volatility,
};
use datafusion_expr_common::signature::{Coercion, TypeSignatureClass};
use datafusion_macros::user_doc;
use std::any::Any;

Expand Down Expand Up @@ -53,11 +57,20 @@ impl Default for SHA224Func {

impl SHA224Func {
pub fn new() -> Self {
use DataType::*;
Self {
signature: Signature::uniform(
1,
vec![Utf8View, Utf8, LargeUtf8, Binary, LargeBinary],
signature: Signature::one_of(
vec![
TypeSignature::Coercible(vec![Coercion::new_implicit(
TypeSignatureClass::Native(logical_binary()),
vec![TypeSignatureClass::Native(logical_string())],
NativeType::String,
)]),
TypeSignature::Coercible(vec![Coercion::new_implicit(
TypeSignatureClass::Native(logical_binary()),
vec![TypeSignatureClass::Native(logical_binary())],
NativeType::Binary,
)]),
],
Volatility::Immutable,
),
}
Expand Down
25 changes: 19 additions & 6 deletions datafusion/functions/src/crypto/sha256.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,15 @@
//! "crypto" DataFusion functions
use super::basic::{sha256, utf8_or_binary_to_binary_type};
use arrow::datatypes::DataType;
use datafusion_common::Result;
use datafusion_common::{
types::{logical_binary, logical_string, NativeType},
Result,
};
use datafusion_expr::{
ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
Volatility,
TypeSignature, Volatility,
};
use datafusion_expr_common::signature::{Coercion, TypeSignatureClass};
use datafusion_macros::user_doc;
use std::any::Any;

Expand Down Expand Up @@ -52,11 +56,20 @@ impl Default for SHA256Func {

impl SHA256Func {
pub fn new() -> Self {
use DataType::*;
Self {
signature: Signature::uniform(
1,
vec![Utf8View, Utf8, LargeUtf8, Binary, LargeBinary],
signature: Signature::one_of(
vec![
TypeSignature::Coercible(vec![Coercion::new_implicit(
TypeSignatureClass::Native(logical_binary()),
vec![TypeSignatureClass::Native(logical_string())],
NativeType::String,
)]),
TypeSignature::Coercible(vec![Coercion::new_implicit(
TypeSignatureClass::Native(logical_binary()),
vec![TypeSignatureClass::Native(logical_binary())],
NativeType::Binary,
)]),
],
Volatility::Immutable,
),
}
Expand Down
25 changes: 19 additions & 6 deletions datafusion/functions/src/crypto/sha384.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,15 @@
//! "crypto" DataFusion functions
use super::basic::{sha384, utf8_or_binary_to_binary_type};
use arrow::datatypes::DataType;
use datafusion_common::Result;
use datafusion_common::{
types::{logical_binary, logical_string, NativeType},
Result,
};
use datafusion_expr::{
ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
Volatility,
TypeSignature, Volatility,
};
use datafusion_expr_common::signature::{Coercion, TypeSignatureClass};
use datafusion_macros::user_doc;
use std::any::Any;

Expand Down Expand Up @@ -52,11 +56,20 @@ impl Default for SHA384Func {

impl SHA384Func {
pub fn new() -> Self {
use DataType::*;
Self {
signature: Signature::uniform(
1,
vec![Utf8View, Utf8, LargeUtf8, Binary, LargeBinary],
signature: Signature::one_of(
vec![
TypeSignature::Coercible(vec![Coercion::new_implicit(
TypeSignatureClass::Native(logical_binary()),
vec![TypeSignatureClass::Native(logical_string())],
NativeType::String,
)]),
TypeSignature::Coercible(vec![Coercion::new_implicit(
TypeSignatureClass::Native(logical_binary()),
vec![TypeSignatureClass::Native(logical_binary())],
NativeType::Binary,
)]),
],
Volatility::Immutable,
),
}
Expand Down
Loading