Skip to content

Commit 417f32e

Browse files
committed
port other crypto functions
1 parent ce45bb2 commit 417f32e

File tree

17 files changed

+470
-535
lines changed

17 files changed

+470
-535
lines changed

datafusion/expr/src/built_in_function.rs

+1-42
Original file line numberDiff line numberDiff line change
@@ -202,8 +202,6 @@ pub enum BuiltinScalarFunction {
202202
Lower,
203203
/// ltrim
204204
Ltrim,
205-
/// md5
206-
MD5,
207205
/// octet_length
208206
OctetLength,
209207
/// random
@@ -220,14 +218,6 @@ pub enum BuiltinScalarFunction {
220218
Rpad,
221219
/// rtrim
222220
Rtrim,
223-
/// sha224
224-
SHA224,
225-
/// sha256
226-
SHA256,
227-
/// sha384
228-
SHA384,
229-
/// Sha512
230-
SHA512,
231221
/// split_part
232222
SplitPart,
233223
/// string_to_array
@@ -398,7 +388,6 @@ impl BuiltinScalarFunction {
398388
BuiltinScalarFunction::Lpad => Volatility::Immutable,
399389
BuiltinScalarFunction::Lower => Volatility::Immutable,
400390
BuiltinScalarFunction::Ltrim => Volatility::Immutable,
401-
BuiltinScalarFunction::MD5 => Volatility::Immutable,
402391
BuiltinScalarFunction::OctetLength => Volatility::Immutable,
403392
BuiltinScalarFunction::Radians => Volatility::Immutable,
404393
BuiltinScalarFunction::Repeat => Volatility::Immutable,
@@ -407,10 +396,6 @@ impl BuiltinScalarFunction {
407396
BuiltinScalarFunction::Right => Volatility::Immutable,
408397
BuiltinScalarFunction::Rpad => Volatility::Immutable,
409398
BuiltinScalarFunction::Rtrim => Volatility::Immutable,
410-
BuiltinScalarFunction::SHA224 => Volatility::Immutable,
411-
BuiltinScalarFunction::SHA256 => Volatility::Immutable,
412-
BuiltinScalarFunction::SHA384 => Volatility::Immutable,
413-
BuiltinScalarFunction::SHA512 => Volatility::Immutable,
414399
BuiltinScalarFunction::SplitPart => Volatility::Immutable,
415400
BuiltinScalarFunction::StringToArray => Volatility::Immutable,
416401
BuiltinScalarFunction::StartsWith => Volatility::Immutable,
@@ -646,7 +631,6 @@ impl BuiltinScalarFunction {
646631
BuiltinScalarFunction::Ltrim => {
647632
utf8_to_str_type(&input_expr_types[0], "ltrim")
648633
}
649-
BuiltinScalarFunction::MD5 => utf8_to_str_type(&input_expr_types[0], "md5"),
650634
BuiltinScalarFunction::OctetLength => {
651635
utf8_to_int_type(&input_expr_types[0], "octet_length")
652636
}
@@ -669,18 +653,6 @@ impl BuiltinScalarFunction {
669653
BuiltinScalarFunction::Rtrim => {
670654
utf8_to_str_type(&input_expr_types[0], "rtrim")
671655
}
672-
BuiltinScalarFunction::SHA224 => {
673-
utf8_or_binary_to_binary_type(&input_expr_types[0], "sha224")
674-
}
675-
BuiltinScalarFunction::SHA256 => {
676-
utf8_or_binary_to_binary_type(&input_expr_types[0], "sha256")
677-
}
678-
BuiltinScalarFunction::SHA384 => {
679-
utf8_or_binary_to_binary_type(&input_expr_types[0], "sha384")
680-
}
681-
BuiltinScalarFunction::SHA512 => {
682-
utf8_or_binary_to_binary_type(&input_expr_types[0], "sha512")
683-
}
684656
BuiltinScalarFunction::SplitPart => {
685657
utf8_to_str_type(&input_expr_types[0], "split_part")
686658
}
@@ -876,15 +848,6 @@ impl BuiltinScalarFunction {
876848
BuiltinScalarFunction::Coalesce => {
877849
Signature::variadic_equal(self.volatility())
878850
}
879-
BuiltinScalarFunction::SHA224
880-
| BuiltinScalarFunction::SHA256
881-
| BuiltinScalarFunction::SHA384
882-
| BuiltinScalarFunction::SHA512
883-
| BuiltinScalarFunction::MD5 => Signature::uniform(
884-
1,
885-
vec![Utf8, LargeUtf8, Binary, LargeBinary],
886-
self.volatility(),
887-
),
888851
BuiltinScalarFunction::Ascii
889852
| BuiltinScalarFunction::BitLength
890853
| BuiltinScalarFunction::CharacterLength
@@ -1359,11 +1322,6 @@ impl BuiltinScalarFunction {
13591322
BuiltinScalarFunction::FromUnixtime => &["from_unixtime"],
13601323

13611324
// hashing functions
1362-
BuiltinScalarFunction::MD5 => &["md5"],
1363-
BuiltinScalarFunction::SHA224 => &["sha224"],
1364-
BuiltinScalarFunction::SHA256 => &["sha256"],
1365-
BuiltinScalarFunction::SHA384 => &["sha384"],
1366-
BuiltinScalarFunction::SHA512 => &["sha512"],
13671325

13681326
// other functions
13691327
BuiltinScalarFunction::ArrowTypeof => &["arrow_typeof"],
@@ -1501,6 +1459,7 @@ get_optimal_return_type!(utf8_to_str_type, DataType::LargeUtf8, DataType::Utf8);
15011459
// `utf8_to_int_type`: returns either a Int32 or Int64 based on the input type size.
15021460
get_optimal_return_type!(utf8_to_int_type, DataType::Int64, DataType::Int32);
15031461

1462+
#[warn(dead_code)]
15041463
fn utf8_or_binary_to_binary_type(arg_type: &DataType, name: &str) -> Result<DataType> {
15051464
Ok(match arg_type {
15061465
DataType::LargeUtf8

datafusion/expr/src/expr_fn.rs

-5
Original file line numberDiff line numberDiff line change
@@ -769,7 +769,6 @@ scalar_expr!(
769769
string,
770770
"removes all characters, spaces by default, from the beginning of a string"
771771
);
772-
scalar_expr!(MD5, md5, string, "returns the MD5 hash of a string");
773772
scalar_expr!(
774773
OctetLength,
775774
octet_length,
@@ -786,10 +785,6 @@ scalar_expr!(
786785
string,
787786
"removes all characters, spaces by default, from the end of a string"
788787
);
789-
scalar_expr!(SHA224, sha224, string, "SHA-224 hash");
790-
scalar_expr!(SHA256, sha256, string, "SHA-256 hash");
791-
scalar_expr!(SHA384, sha384, string, "SHA-384 hash");
792-
scalar_expr!(SHA512, sha512, string, "SHA-512 hash");
793788
scalar_expr!(SplitPart, split_part, string delimiter index, "splits a string based on a delimiter and picks out the desired field based on the index.");
794789
scalar_expr!(StringToArray, string_to_array, string delimiter null_string, "splits a `string` based on a `delimiter` and returns an array of parts. Any parts matching the optional `null_string` will be replaced with `NULL`");
795790
scalar_expr!(StartsWith, starts_with, string prefix, "whether the `string` starts with the `prefix`");

datafusion/functions/src/crypto/basic.rs

+28-25
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,13 @@
1717

1818
//! "crypto" DataFusion functions
1919
20+
use arrow::array::StringArray;
2021
use arrow::array::{Array, ArrayRef, BinaryArray, OffsetSizeTrait};
2122
use arrow::datatypes::DataType;
2223
use blake2::{Blake2b512, Blake2s256, Digest};
2324
use blake3::Hasher as Blake3;
25+
use datafusion_common::cast::as_binary_array;
26+
2427
use datafusion_common::plan_err;
2528
use datafusion_common::{
2629
cast::{as_generic_binary_array, as_generic_string_array},
@@ -169,31 +172,31 @@ impl fmt::Display for DigestAlgorithm {
169172
}
170173
}
171174
// /// computes md5 hash digest of the given input
172-
// pub fn md5(args: &[ColumnarValue]) -> Result<ColumnarValue> {
173-
// if args.len() != 1 {
174-
// return exec_err!(
175-
// "{:?} args were supplied but {} takes exactly one argument",
176-
// args.len(),
177-
// DigestAlgorithm::Md5
178-
// );
179-
// }
180-
// let value = digest_process(&args[0], DigestAlgorithm::Md5)?;
181-
// // md5 requires special handling because of its unique utf8 return type
182-
// Ok(match value {
183-
// ColumnarValue::Array(array) => {
184-
// let binary_array = as_binary_array(&array)?;
185-
// let string_array: StringArray = binary_array
186-
// .iter()
187-
// .map(|opt| opt.map(hex_encode::<_>))
188-
// .collect();
189-
// ColumnarValue::Array(Arc::new(string_array))
190-
// }
191-
// ColumnarValue::Scalar(ScalarValue::Binary(opt)) => {
192-
// ColumnarValue::Scalar(ScalarValue::Utf8(opt.map(hex_encode::<_>)))
193-
// }
194-
// _ => return exec_err!("Impossibly got invalid results from digest"),
195-
// })
196-
// }
175+
pub fn md5(args: &[ColumnarValue]) -> Result<ColumnarValue> {
176+
if args.len() != 1 {
177+
return exec_err!(
178+
"{:?} args were supplied but {} takes exactly one argument",
179+
args.len(),
180+
DigestAlgorithm::Md5
181+
);
182+
}
183+
let value = digest_process(&args[0], DigestAlgorithm::Md5)?;
184+
// md5 requires special handling because of its unique utf8 return type
185+
Ok(match value {
186+
ColumnarValue::Array(array) => {
187+
let binary_array = as_binary_array(&array)?;
188+
let string_array: StringArray = binary_array
189+
.iter()
190+
.map(|opt| opt.map(hex_encode::<_>))
191+
.collect();
192+
ColumnarValue::Array(Arc::new(string_array))
193+
}
194+
ColumnarValue::Scalar(ScalarValue::Binary(opt)) => {
195+
ColumnarValue::Scalar(ScalarValue::Utf8(opt.map(hex_encode::<_>)))
196+
}
197+
_ => return exec_err!("Impossibly got invalid results from digest"),
198+
})
199+
}
197200

198201
/// this function exists so that we do not need to pull in the crate hex. it is only used by md5
199202
/// function above
+83
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
//! "crypto" DataFusion functions
19+
use crate::crypto::basic::md5;
20+
use arrow::datatypes::DataType;
21+
use datafusion_common::{plan_err, Result};
22+
use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility};
23+
use std::any::Any;
24+
25+
use super::basic::utf8_or_binary_to_binary_type;
26+
27+
#[derive(Debug)]
28+
pub(super) struct Md5Func {
29+
signature: Signature,
30+
}
31+
impl Md5Func {
32+
pub fn new() -> Self {
33+
use DataType::*;
34+
Self {
35+
signature: Signature::uniform(
36+
1,
37+
vec![Utf8, LargeUtf8, Binary, LargeBinary],
38+
Volatility::Immutable,
39+
),
40+
}
41+
}
42+
}
43+
impl ScalarUDFImpl for Md5Func {
44+
fn as_any(&self) -> &dyn Any {
45+
self
46+
}
47+
48+
fn name(&self) -> &str {
49+
"md5"
50+
}
51+
52+
fn signature(&self) -> &Signature {
53+
&self.signature
54+
}
55+
56+
fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
57+
use DataType::*;
58+
Ok(match &arg_types[0] {
59+
LargeUtf8 | LargeBinary => LargeUtf8,
60+
Utf8 | Binary => Utf8,
61+
Null => Null,
62+
Dictionary(_, t) => match **t {
63+
LargeUtf8 | LargeBinary => LargeUtf8,
64+
Utf8 | Binary => Utf8,
65+
Null => Null,
66+
_ => {
67+
return plan_err!(
68+
"the md5 can only accept strings but got {:?}",
69+
**t
70+
);
71+
}
72+
},
73+
other => {
74+
return plan_err!(
75+
"The md5 function can only accept strings. Got {other}"
76+
);
77+
}
78+
})
79+
}
80+
fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
81+
md5(args)
82+
}
83+
}

datafusion/functions/src/crypto/mod.rs

+31-1
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,39 @@
1919
2020
pub mod basic;
2121
pub mod digest;
22+
pub mod md5;
23+
pub mod sha224;
24+
pub mod sha256;
25+
pub mod sha384;
26+
pub mod sha512;
2227
make_udf_function!(digest::DigestFunc, DIGEST, digest);
28+
make_udf_function!(md5::Md5Func, MD5, md5);
29+
make_udf_function!(sha224::SHA224Func, SHA224, sha224);
30+
make_udf_function!(sha256::SHA256Func, SHA256, sha256);
31+
make_udf_function!(sha384::SHA384Func, SHA384, sha384);
32+
make_udf_function!(sha512::SHA512Func, SHA512, sha512);
2333
export_functions!((
2434
digest,
2535
input_arg1 input_arg2,
26-
"returns a list of regular expression matches in a string. "
36+
"Computes the binary hash of an expression using the specified algorithm."
37+
),(
38+
md5,
39+
input_arg,
40+
"Computes an MD5 128-bit checksum for a string expression."
41+
),(
42+
sha224,
43+
input_arg1,
44+
"Computes the SHA-224 hash of a binary string."
45+
),(
46+
sha256,
47+
input_arg1,
48+
"Computes the SHA-256 hash of a binary string."
49+
),(
50+
sha384,
51+
input_arg1,
52+
"Computes the SHA-384 hash of a binary string."
53+
),(
54+
sha512,
55+
input_arg1,
56+
"Computes the SHA-512 hash of a binary string."
2757
));
+60
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
//! "crypto" DataFusion functions
19+
use super::basic::{sha224, utf8_or_binary_to_binary_type};
20+
use arrow::datatypes::DataType;
21+
use datafusion_common::Result;
22+
use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility};
23+
use std::any::Any;
24+
25+
#[derive(Debug)]
26+
pub(super) struct SHA224Func {
27+
signature: Signature,
28+
}
29+
impl SHA224Func {
30+
pub fn new() -> Self {
31+
use DataType::*;
32+
Self {
33+
signature: Signature::uniform(
34+
1,
35+
vec![Utf8, LargeUtf8, Binary, LargeBinary],
36+
Volatility::Immutable,
37+
),
38+
}
39+
}
40+
}
41+
impl ScalarUDFImpl for SHA224Func {
42+
fn as_any(&self) -> &dyn Any {
43+
self
44+
}
45+
46+
fn name(&self) -> &str {
47+
"sha224"
48+
}
49+
50+
fn signature(&self) -> &Signature {
51+
&self.signature
52+
}
53+
54+
fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
55+
utf8_or_binary_to_binary_type(&arg_types[0], self.name())
56+
}
57+
fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
58+
sha224(args)
59+
}
60+
}

0 commit comments

Comments
 (0)