From a682d1ef965ad29093cbf546d20c3e53453668f0 Mon Sep 17 00:00:00 2001 From: Spaarsh-root Date: Mon, 27 Jan 2025 02:16:49 +0530 Subject: [PATCH 01/14] Added hash module with xxhash32 and xxhash64 functions. --- datafusion/functions/Cargo.toml | 3 + datafusion/functions/src/hash/mod.rs | 42 +++++ datafusion/functions/src/hash/xxhash.rs | 227 ++++++++++++++++++++++++ datafusion/functions/src/lib.rs | 7 + 4 files changed, 279 insertions(+) create mode 100644 datafusion/functions/src/hash/mod.rs create mode 100644 datafusion/functions/src/hash/xxhash.rs diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml index db3e6838f6a5..f6336cd18af8 100644 --- a/datafusion/functions/Cargo.toml +++ b/datafusion/functions/Cargo.toml @@ -35,6 +35,7 @@ workspace = true # enable core functions core_expressions = [] crypto_expressions = ["md-5", "sha2", "blake2", "blake3"] +hash_expressions = ["twox-hash"] # enable datetime functions datetime_expressions = [] # Enable encoding by default so the doctests work. In general don't automatically enable all packages. @@ -46,6 +47,7 @@ default = [ "regex_expressions", "string_expressions", "unicode_expressions", + "hash_expressions", ] # enable encode/decode functions encoding_expressions = ["base64", "hex"] @@ -85,6 +87,7 @@ md-5 = { version = "^0.10.0", optional = true } rand = { workspace = true } regex = { workspace = true, optional = true } sha2 = { version = "^0.10.1", optional = true } +twox-hash = { version = "1.6.3", optional = true } unicode-segmentation = { version = "^1.7.1", optional = true } uuid = { version = "1.7", features = ["v4"], optional = true } diff --git a/datafusion/functions/src/hash/mod.rs b/datafusion/functions/src/hash/mod.rs new file mode 100644 index 000000000000..5f204ccd5c95 --- /dev/null +++ b/datafusion/functions/src/hash/mod.rs @@ -0,0 +1,42 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! "xxhash" DataFusion functions + +use datafusion_expr::ScalarUDF; +use std::sync::Arc; + +pub mod xxhash; +make_udf_function!(xxhash::XxHash32Func, xxhash32); +make_udf_function!(xxhash::XxHash64Func, xxhash64); + +pub mod expr_fn { + export_functions!(( + xxhash32, + "Computes the XXHash32 hash of a binary string.", + input + ),( + xxhash64, + "Computes the XXHash64 hash of a binary string.", + input + )); +} + +/// Returns all DataFusion functions defined in this package +pub fn functions() -> Vec> { + vec![xxhash32(), xxhash64()] +} diff --git a/datafusion/functions/src/hash/xxhash.rs b/datafusion/functions/src/hash/xxhash.rs new file mode 100644 index 000000000000..c967a1883779 --- /dev/null +++ b/datafusion/functions/src/hash/xxhash.rs @@ -0,0 +1,227 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow::array::{Array, StringArray, UInt32Array, UInt64Array}; +use arrow::datatypes::DataType; +use datafusion_common::Result; +use datafusion_expr::{ + ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, +}; +use twox_hash::{XxHash64, XxHash32}; +use datafusion_macros::user_doc; +use std::any::Any; +use std::hash::Hasher; +use datafusion_common::DataFusionError; +use std::sync::Arc; + +#[user_doc( + doc_section(label = "Hashing Functions"), + description = "Computes the XXHash64 hash of a binary string.", + syntax_example = "xxhash64(expression)", + sql_example = r#"```sql +> select xxhash64('foo'); ++-------------------------------------------+ +| xxhash64(Utf8("foo")) | ++-------------------------------------------+ +| | ++-------------------------------------------+ +```"#, + standard_argument(name = "expression", prefix = "String") +)] + +#[derive(Debug)] +pub struct XxHash64Func { + signature: Signature, +} +impl Default for XxHash64Func { + fn default() -> Self { + Self::new() + } +} + +impl XxHash64Func { + pub fn new() -> Self { + use DataType::*; + Self { + signature: Signature::uniform( + 1, + vec![Utf8View, Utf8, LargeUtf8, Binary, LargeBinary], + Volatility::Immutable, + ), + } + } +} +impl ScalarUDFImpl for XxHash64Func { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "xxhash64" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, arg_types: &[DataType]) -> Result { + Ok(DataType::Utf8) + } + + fn invoke_batch( + &self, + args: &[ColumnarValue], + _number_rows: usize, + ) -> Result { + // Assuming that the argument is either a StringArray or BinaryArray + let input_data = &args[0]; + + // Collect the output hash results + let result = match input_data { + ColumnarValue::Array(array) => { + // Check if the array is a StringArray (or another type you expect) + if let Some(string_array) = array.as_any().downcast_ref::() { + let mut hash_results: Vec = Vec::with_capacity(string_array.len()); + for i in 0..string_array.len() { + if !string_array.is_null(i) { + let value = string_array.value(i); // returns &str + let mut hasher = XxHash64::default(); + hasher.write(value.as_bytes()); // as_bytes() on &str returns &[u8] + let hash = hasher.finish(); + let hash_hex = hex::encode(hash.to_be_bytes()); + hash_results.push(hash_hex); + } else { + hash_results.push(String::from("00000000")); // or handle null values differently + } + } + + // Create an UInt64Array from the hash results + let hash_array = StringArray::from(hash_results); + Arc::new(hash_array) as Arc + } else { + return Err(DataFusionError::Internal("Unsupported array type".to_string())); + } + }, + _ => return Err(DataFusionError::Internal("Unsupported input type".to_string())), + }; + + Ok(ColumnarValue::Array(result)) + } + + fn 
documentation(&self) -> Option<&Documentation> { + self.doc() + } +} + +#[user_doc( + doc_section(label = "Hashing Functions"), + description = "Computes the XXHash32 hash of a binary string.", + syntax_example = "xxhash32(expression)", + sql_example = r#"```sql +> select xxhash32('foo'); ++-------------------------------------------+ +| xxhash32(Utf8("foo")) | ++-------------------------------------------+ +| | ++-------------------------------------------+ +```"#, + standard_argument(name = "expression", prefix = "String") +)] + +#[derive(Debug)] +pub struct XxHash32Func { + signature: Signature, +} +impl Default for XxHash32Func { + fn default() -> Self { + Self::new() + } +} + +impl XxHash32Func { + pub fn new() -> Self { + use DataType::*; + Self { + signature: Signature::uniform( + 1, + vec![Utf8View, Utf8, LargeUtf8, Binary, LargeBinary], + Volatility::Immutable, + ), + } + } +} +impl ScalarUDFImpl for XxHash32Func { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "xxhash32" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, arg_types: &[DataType]) -> Result { + Ok(DataType::Utf8) + } + + fn invoke_batch( + &self, + args: &[ColumnarValue], + _number_rows: usize, + ) -> Result { + // Assuming that the argument is either a StringArray or BinaryArray + let input_data = &args[0]; + + // Collect the output hash results + let result = match input_data { + ColumnarValue::Array(array) => { + // Check if the array is a StringArray (or another type you expect) + if let Some(string_array) = array.as_any().downcast_ref::() { + let mut hash_results: Vec = Vec::with_capacity(string_array.len()); + for i in 0..string_array.len() { + if !string_array.is_null(i) { + let value = string_array.value(i); // returns &str + let mut hasher = XxHash32::default(); + hasher.write(value.as_bytes()); // as_bytes() on &str returns &[u8] + let hash: u32 = hasher.finish() as u32; + let hash_hex = hex::encode(hash.to_be_bytes()); + hash_results.push(hash_hex); + } else { + hash_results.push(String::from("00000000")); // or handle null values differently + } + } + + // Create a StringArray from the hash results + let hash_array = StringArray::from(hash_results); + Arc::new(hash_array) as Arc + } else { + return Err(DataFusionError::Internal("Unsupported array type".to_string())); + } + }, + _ => return Err(DataFusionError::Internal("Unsupported input type".to_string())), + }; + + Ok(ColumnarValue::Array(result)) + } + + fn documentation(&self) -> Option<&Documentation> { + self.doc() + } +} diff --git a/datafusion/functions/src/lib.rs b/datafusion/functions/src/lib.rs index 7278fe3ec536..0d2806255144 100644 --- a/datafusion/functions/src/lib.rs +++ b/datafusion/functions/src/lib.rs @@ -133,6 +133,10 @@ make_stub_package!(crypto, "crypto_expressions"); pub mod unicode; make_stub_package!(unicode, "unicode_expressions"); +#[cfg(feature = "hash_expressions")] +pub mod hash; +make_stub_package!(hash, "hash_expressions"); + #[cfg(any(feature = "datetime_expressions", feature = "unicode_expressions"))] pub mod planner; @@ -158,6 +162,8 @@ pub mod expr_fn { pub use super::string::expr_fn::*; #[cfg(feature = "unicode_expressions")] pub use super::unicode::expr_fn::*; + #[cfg(feature = "hash_expressions")] + pub use super::hash::expr_fn::*; } /// Return all default functions @@ -171,6 +177,7 @@ pub fn all_default_functions() -> Vec> { .chain(crypto::functions()) .chain(unicode::functions()) .chain(string::functions()) + .chain(hash::functions()) 
.collect::>() } From 25ebf13060211675bc18ad95dd94af5be403c6ef Mon Sep 17 00:00:00 2001 From: Spaarsh-root Date: Mon, 27 Jan 2025 21:05:45 +0530 Subject: [PATCH 02/14] Added support for individual values and for ingesting various types of input data --- datafusion/functions/src/hash/xxhash.rs | 215 ++++++++++++++++-------- 1 file changed, 149 insertions(+), 66 deletions(-) diff --git a/datafusion/functions/src/hash/xxhash.rs b/datafusion/functions/src/hash/xxhash.rs index c967a1883779..0fc50325ffc5 100644 --- a/datafusion/functions/src/hash/xxhash.rs +++ b/datafusion/functions/src/hash/xxhash.rs @@ -15,9 +15,9 @@ // specific language governing permissions and limitations // under the License. -use arrow::array::{Array, StringArray, UInt32Array, UInt64Array}; +use arrow::array::{Array, StringArray, Int32Array, Int64Array, UInt32Array, UInt64Array}; use arrow::datatypes::DataType; -use datafusion_common::Result; +use datafusion_common::{Result, ScalarValue}; use datafusion_expr::{ ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, }; @@ -64,6 +64,24 @@ impl XxHash64Func { ), } } + + pub fn hash_scalar(&self, value: &ColumnarValue) -> Result { + let mut hasher = XxHash64::default(); + let value_str = match value { + ColumnarValue::Scalar(scalar) => match scalar { + ScalarValue::Utf8(Some(v)) => v.clone(), + ScalarValue::Int32(Some(v)) => v.to_string(), + ScalarValue::Int64(Some(v)) => v.to_string(), + ScalarValue::UInt32(Some(v)) => v.to_string(), + ScalarValue::UInt64(Some(v)) => v.to_string(), + _ => return Err(DataFusionError::Internal("Unsupported scalar type".to_string())), + }, + _ => return Err(DataFusionError::Internal("Expected a scalar value".to_string())), + }; + hasher.write(value_str.as_bytes()); + let hash: u64 = hasher.finish() as u64; + Ok(hex::encode(hash.to_be_bytes())) + } } impl ScalarUDFImpl for XxHash64Func { fn as_any(&self) -> &dyn Any { @@ -90,34 +108,57 @@ impl ScalarUDFImpl for XxHash64Func { // Assuming that the argument is either a StringArray or BinaryArray let input_data = &args[0]; - // Collect the output hash results - let result = match input_data { - ColumnarValue::Array(array) => { - // Check if the array is a StringArray (or another type you expect) - if let Some(string_array) = array.as_any().downcast_ref::() { - let mut hash_results: Vec = Vec::with_capacity(string_array.len()); - for i in 0..string_array.len() { - if !string_array.is_null(i) { - let value = string_array.value(i); // returns &str - let mut hasher = XxHash64::default(); - hasher.write(value.as_bytes()); // as_bytes() on &str returns &[u8] - let hash = hasher.finish(); - let hash_hex = hex::encode(hash.to_be_bytes()); - hash_results.push(hash_hex); - } else { - hash_results.push(String::from("00000000")); // or handle null values differently - } - } - - // Create an UInt64Array from the hash results - let hash_array = StringArray::from(hash_results); - Arc::new(hash_array) as Arc - } else { - return Err(DataFusionError::Internal("Unsupported array type".to_string())); + // Collect the output hash results + let result = match input_data { + ColumnarValue::Array(array) => { + let mut hash_results: Vec = Vec::with_capacity(array.len()); + for i in 0..array.len() { + if array.is_null(i) { + hash_results.push(String::from("00000000")); // or handle null values differently + continue; } - }, - _ => return Err(DataFusionError::Internal("Unsupported input type".to_string())), - }; + + let mut hasher = XxHash64::default(); + let value_str = match array.data_type() { + DataType::Utf8 
=> { + let string_array = array.as_any().downcast_ref::().unwrap(); + string_array.value(i).to_string() + } + DataType::Int32 => { + let int_array = array.as_any().downcast_ref::().unwrap(); + int_array.value(i).to_string() + } + DataType::Int64 => { + let int_array = array.as_any().downcast_ref::().unwrap(); + int_array.value(i).to_string() + } + DataType::UInt32 => { + let uint_array = array.as_any().downcast_ref::().unwrap(); + uint_array.value(i).to_string() + } + DataType::UInt64 => { + let uint_array = array.as_any().downcast_ref::().unwrap(); + uint_array.value(i).to_string() + } + _ => return Err(DataFusionError::Internal("Unsupported array type".to_string())), + }; + hasher.write(value_str.as_bytes()); + let hash: u64 = hasher.finish() as u64; + let hash_hex = hex::encode(hash.to_be_bytes()); + hash_results.push(hash_hex); + } + + // Create a StringArray from the hash results + let hash_array = StringArray::from(hash_results); + Arc::new(hash_array) as Arc + }, + ColumnarValue::Scalar(scalar) => { + let hash_result = self.hash_scalar(&ColumnarValue::Scalar(scalar.clone()))?; + let hash_array = StringArray::from(vec![hash_result]); + Arc::new(hash_array) as Arc + }, + _ => return Err(DataFusionError::Internal("Unsupported input type".to_string())), + }; Ok(ColumnarValue::Array(result)) } @@ -163,7 +204,27 @@ impl XxHash32Func { ), } } + + pub fn hash_scalar(&self, value: &ColumnarValue) -> Result { + let mut hasher = XxHash32::default(); + let value_str = match value { + ColumnarValue::Scalar(scalar) => match scalar { + ScalarValue::Utf8(Some(v)) => v.clone(), + ScalarValue::Int32(Some(v)) => v.to_string(), + ScalarValue::Int64(Some(v)) => v.to_string(), + ScalarValue::UInt32(Some(v)) => v.to_string(), + ScalarValue::UInt64(Some(v)) => v.to_string(), + _ => return Err(DataFusionError::Internal("Unsupported scalar type".to_string())), + }, + _ => return Err(DataFusionError::Internal("Expected a scalar value".to_string())), + }; + hasher.write(value_str.as_bytes()); + let hash: u32 = hasher.finish() as u32; + Ok(hex::encode(hash.to_be_bytes())) + } } + + impl ScalarUDFImpl for XxHash32Func { fn as_any(&self) -> &dyn Any { self @@ -181,47 +242,69 @@ impl ScalarUDFImpl for XxHash32Func { Ok(DataType::Utf8) } - fn invoke_batch( - &self, - args: &[ColumnarValue], - _number_rows: usize, - ) -> Result { - // Assuming that the argument is either a StringArray or BinaryArray - let input_data = &args[0]; + fn invoke_batch( + &self, + args: &[ColumnarValue], + _number_rows: usize, + ) -> Result { + let input_data = &args[0]; - // Collect the output hash results - let result = match input_data { - ColumnarValue::Array(array) => { - // Check if the array is a StringArray (or another type you expect) - if let Some(string_array) = array.as_any().downcast_ref::() { - let mut hash_results: Vec = Vec::with_capacity(string_array.len()); - for i in 0..string_array.len() { - if !string_array.is_null(i) { - let value = string_array.value(i); // returns &str - let mut hasher = XxHash32::default(); - hasher.write(value.as_bytes()); // as_bytes() on &str returns &[u8] - let hash: u32 = hasher.finish() as u32; - let hash_hex = hex::encode(hash.to_be_bytes()); - hash_results.push(hash_hex); - } else { - hash_results.push(String::from("00000000")); // or handle null values differently - } + // Collect the output hash results + let result = match input_data { + ColumnarValue::Array(array) => { + let mut hash_results: Vec = Vec::with_capacity(array.len()); + for i in 0..array.len() { + if array.is_null(i) { + 
hash_results.push(String::from("00000000")); // or handle null values differently + continue; + } + + let mut hasher = XxHash32::default(); + let value_str = match array.data_type() { + DataType::Utf8 => { + let string_array = array.as_any().downcast_ref::().unwrap(); + string_array.value(i).to_string() + } + DataType::Int32 => { + let int_array = array.as_any().downcast_ref::().unwrap(); + int_array.value(i).to_string() + } + DataType::Int64 => { + let int_array = array.as_any().downcast_ref::().unwrap(); + int_array.value(i).to_string() + } + DataType::UInt32 => { + let uint_array = array.as_any().downcast_ref::().unwrap(); + uint_array.value(i).to_string() } + DataType::UInt64 => { + let uint_array = array.as_any().downcast_ref::().unwrap(); + uint_array.value(i).to_string() + } + _ => return Err(DataFusionError::Internal("Unsupported array type".to_string())), + }; + hasher.write(value_str.as_bytes()); + let hash: u32 = hasher.finish() as u32; + let hash_hex = hex::encode(hash.to_be_bytes()); + hash_results.push(hash_hex); + } - // Create a StringArray from the hash results - let hash_array = StringArray::from(hash_results); - Arc::new(hash_array) as Arc - } else { - return Err(DataFusionError::Internal("Unsupported array type".to_string())); - } - }, - _ => return Err(DataFusionError::Internal("Unsupported input type".to_string())), - }; - - Ok(ColumnarValue::Array(result)) - } + // Create a StringArray from the hash results + let hash_array = StringArray::from(hash_results); + Arc::new(hash_array) as Arc + }, + ColumnarValue::Scalar(scalar) => { + let hash_result = self.hash_scalar(&ColumnarValue::Scalar(scalar.clone()))?; + let hash_array = StringArray::from(vec![hash_result]); + Arc::new(hash_array) as Arc + }, + _ => return Err(DataFusionError::Internal("Unsupported input type".to_string())), + }; + + Ok(ColumnarValue::Array(result)) + } fn documentation(&self) -> Option<&Documentation> { self.doc() } -} +} \ No newline at end of file From 5fcc67bb55ea2f9e4e1c98c269940cd9759f7382 Mon Sep 17 00:00:00 2001 From: Spaarsh-root Date: Mon, 27 Jan 2025 22:46:26 +0530 Subject: [PATCH 03/14] Refactored code --- datafusion/functions/src/hash/xxhash.rs | 235 ++++++++++-------------- 1 file changed, 102 insertions(+), 133 deletions(-) diff --git a/datafusion/functions/src/hash/xxhash.rs b/datafusion/functions/src/hash/xxhash.rs index 0fc50325ffc5..cedc93d83f43 100644 --- a/datafusion/functions/src/hash/xxhash.rs +++ b/datafusion/functions/src/hash/xxhash.rs @@ -1,17 +1,17 @@ // Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file +// or more contributor license agreements. See the NOTICE file // distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file +// regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at +// with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the +// KIND, either express or implied. 
See the License for the // specific language governing permissions and limitations // under the License. @@ -42,11 +42,11 @@ use std::sync::Arc; ```"#, standard_argument(name = "expression", prefix = "String") )] - #[derive(Debug)] pub struct XxHash64Func { signature: Signature, } + impl Default for XxHash64Func { fn default() -> Self { Self::new() @@ -66,23 +66,11 @@ impl XxHash64Func { } pub fn hash_scalar(&self, value: &ColumnarValue) -> Result { - let mut hasher = XxHash64::default(); - let value_str = match value { - ColumnarValue::Scalar(scalar) => match scalar { - ScalarValue::Utf8(Some(v)) => v.clone(), - ScalarValue::Int32(Some(v)) => v.to_string(), - ScalarValue::Int64(Some(v)) => v.to_string(), - ScalarValue::UInt32(Some(v)) => v.to_string(), - ScalarValue::UInt64(Some(v)) => v.to_string(), - _ => return Err(DataFusionError::Internal("Unsupported scalar type".to_string())), - }, - _ => return Err(DataFusionError::Internal("Expected a scalar value".to_string())), - }; - hasher.write(value_str.as_bytes()); - let hash: u64 = hasher.finish() as u64; - Ok(hex::encode(hash.to_be_bytes())) + let value_str = to_string_from_scalar(value)?; + hash_value(&value_str, XxHash64::default(), HashType::U64) } } + impl ScalarUDFImpl for XxHash64Func { fn as_any(&self) -> &dyn Any { self @@ -96,59 +84,20 @@ impl ScalarUDFImpl for XxHash64Func { &self.signature } - fn return_type(&self, arg_types: &[DataType]) -> Result { + fn return_type(&self, _arg_types: &[DataType]) -> Result { Ok(DataType::Utf8) } - fn invoke_batch( - &self, - args: &[ColumnarValue], - _number_rows: usize, - ) -> Result { - // Assuming that the argument is either a StringArray or BinaryArray - let input_data = &args[0]; - - // Collect the output hash results + fn invoke_batch( + &self, + args: &[ColumnarValue], + _number_rows: usize, + ) -> Result { + let input_data = &args[0]; + let result = match input_data { ColumnarValue::Array(array) => { - let mut hash_results: Vec = Vec::with_capacity(array.len()); - for i in 0..array.len() { - if array.is_null(i) { - hash_results.push(String::from("00000000")); // or handle null values differently - continue; - } - - let mut hasher = XxHash64::default(); - let value_str = match array.data_type() { - DataType::Utf8 => { - let string_array = array.as_any().downcast_ref::().unwrap(); - string_array.value(i).to_string() - } - DataType::Int32 => { - let int_array = array.as_any().downcast_ref::().unwrap(); - int_array.value(i).to_string() - } - DataType::Int64 => { - let int_array = array.as_any().downcast_ref::().unwrap(); - int_array.value(i).to_string() - } - DataType::UInt32 => { - let uint_array = array.as_any().downcast_ref::().unwrap(); - uint_array.value(i).to_string() - } - DataType::UInt64 => { - let uint_array = array.as_any().downcast_ref::().unwrap(); - uint_array.value(i).to_string() - } - _ => return Err(DataFusionError::Internal("Unsupported array type".to_string())), - }; - hasher.write(value_str.as_bytes()); - let hash: u64 = hasher.finish() as u64; - let hash_hex = hex::encode(hash.to_be_bytes()); - hash_results.push(hash_hex); - } - - // Create a StringArray from the hash results + let hash_results = process_array(array, XxHash64::default(), HashType::U64)?; let hash_array = StringArray::from(hash_results); Arc::new(hash_array) as Arc }, @@ -156,12 +105,11 @@ impl ScalarUDFImpl for XxHash64Func { let hash_result = self.hash_scalar(&ColumnarValue::Scalar(scalar.clone()))?; let hash_array = StringArray::from(vec![hash_result]); Arc::new(hash_array) as Arc - }, - _ => 
return Err(DataFusionError::Internal("Unsupported input type".to_string())), + } }; - - Ok(ColumnarValue::Array(result)) - } + + Ok(ColumnarValue::Array(result)) + } fn documentation(&self) -> Option<&Documentation> { self.doc() @@ -182,11 +130,11 @@ impl ScalarUDFImpl for XxHash64Func { ```"#, standard_argument(name = "expression", prefix = "String") )] - #[derive(Debug)] pub struct XxHash32Func { signature: Signature, } + impl Default for XxHash32Func { fn default() -> Self { Self::new() @@ -204,27 +152,13 @@ impl XxHash32Func { ), } } - + pub fn hash_scalar(&self, value: &ColumnarValue) -> Result { - let mut hasher = XxHash32::default(); - let value_str = match value { - ColumnarValue::Scalar(scalar) => match scalar { - ScalarValue::Utf8(Some(v)) => v.clone(), - ScalarValue::Int32(Some(v)) => v.to_string(), - ScalarValue::Int64(Some(v)) => v.to_string(), - ScalarValue::UInt32(Some(v)) => v.to_string(), - ScalarValue::UInt64(Some(v)) => v.to_string(), - _ => return Err(DataFusionError::Internal("Unsupported scalar type".to_string())), - }, - _ => return Err(DataFusionError::Internal("Expected a scalar value".to_string())), - }; - hasher.write(value_str.as_bytes()); - let hash: u32 = hasher.finish() as u32; - Ok(hex::encode(hash.to_be_bytes())) + let value_str = to_string_from_scalar(value)?; + hash_value(&value_str, XxHash32::default(), HashType::U32) } } - impl ScalarUDFImpl for XxHash32Func { fn as_any(&self) -> &dyn Any { self @@ -238,7 +172,7 @@ impl ScalarUDFImpl for XxHash32Func { &self.signature } - fn return_type(&self, arg_types: &[DataType]) -> Result { + fn return_type(&self, _arg_types: &[DataType]) -> Result { Ok(DataType::Utf8) } @@ -249,47 +183,9 @@ impl ScalarUDFImpl for XxHash32Func { ) -> Result { let input_data = &args[0]; - // Collect the output hash results let result = match input_data { ColumnarValue::Array(array) => { - let mut hash_results: Vec = Vec::with_capacity(array.len()); - for i in 0..array.len() { - if array.is_null(i) { - hash_results.push(String::from("00000000")); // or handle null values differently - continue; - } - - let mut hasher = XxHash32::default(); - let value_str = match array.data_type() { - DataType::Utf8 => { - let string_array = array.as_any().downcast_ref::().unwrap(); - string_array.value(i).to_string() - } - DataType::Int32 => { - let int_array = array.as_any().downcast_ref::().unwrap(); - int_array.value(i).to_string() - } - DataType::Int64 => { - let int_array = array.as_any().downcast_ref::().unwrap(); - int_array.value(i).to_string() - } - DataType::UInt32 => { - let uint_array = array.as_any().downcast_ref::().unwrap(); - uint_array.value(i).to_string() - } - DataType::UInt64 => { - let uint_array = array.as_any().downcast_ref::().unwrap(); - uint_array.value(i).to_string() - } - _ => return Err(DataFusionError::Internal("Unsupported array type".to_string())), - }; - hasher.write(value_str.as_bytes()); - let hash: u32 = hasher.finish() as u32; - let hash_hex = hex::encode(hash.to_be_bytes()); - hash_results.push(hash_hex); - } - - // Create a StringArray from the hash results + let hash_results = process_array(array, XxHash32::default(), HashType::U32)?; let hash_array = StringArray::from(hash_results); Arc::new(hash_array) as Arc }, @@ -297,8 +193,7 @@ impl ScalarUDFImpl for XxHash32Func { let hash_result = self.hash_scalar(&ColumnarValue::Scalar(scalar.clone()))?; let hash_array = StringArray::from(vec![hash_result]); Arc::new(hash_array) as Arc - }, - _ => return Err(DataFusionError::Internal("Unsupported input 
type".to_string())), + } }; Ok(ColumnarValue::Array(result)) @@ -307,4 +202,78 @@ impl ScalarUDFImpl for XxHash32Func { fn documentation(&self) -> Option<&Documentation> { self.doc() } +} + +// Helper functions + +fn to_string_from_scalar(value: &ColumnarValue) -> Result { + match value { + ColumnarValue::Scalar(scalar) => match scalar { + ScalarValue::Utf8(Some(v)) => Ok(v.clone()), + ScalarValue::Int32(Some(v)) => Ok(v.to_string()), + ScalarValue::Int64(Some(v)) => Ok(v.to_string()), + ScalarValue::UInt32(Some(v)) => Ok(v.to_string()), + ScalarValue::UInt64(Some(v)) => Ok(v.to_string()), + _ => Err(DataFusionError::Internal("Unsupported scalar type".to_string())), + }, + _ => Err(DataFusionError::Internal("Expected a scalar value".to_string())), + } +} + +#[derive(Clone)] +pub enum HashType { + U32, + U64, +} + +fn hash_value(value_str: &str, mut hasher: T, hash_type: HashType) -> Result { + hasher.write(value_str.as_bytes()); + let hash = hasher.finish(); + match hash_type { + HashType::U32 => { + let hash_u32 = hash as u32; + Ok(hex::encode(hash_u32.to_be_bytes())) + }, + HashType::U64 => { + let hash_u64 = hash as u64; + Ok(hex::encode(hash_u64.to_be_bytes())) + }, + } +} + +fn process_array(array: &dyn Array, mut hasher: T, hash_type: HashType) -> Result> { + let mut hash_results: Vec = Vec::with_capacity(array.len()); + for i in 0..array.len() { + if array.is_null(i) { + hash_results.push(String::from("00000000")); // Handle null values + continue; + } + + let value_str = match array.data_type() { + DataType::Utf8 => { + let string_array = array.as_any().downcast_ref::().unwrap(); + string_array.value(i).to_string() + } + DataType::Int32 => { + let int_array = array.as_any().downcast_ref::().unwrap(); + int_array.value(i).to_string() + } + DataType::Int64 => { + let int_array = array.as_any().downcast_ref::().unwrap(); + int_array.value(i).to_string() + } + DataType::UInt32 => { + let uint_array = array.as_any().downcast_ref::().unwrap(); + uint_array.value(i).to_string() + } + DataType::UInt64 => { + let uint_array = array.as_any().downcast_ref::().unwrap(); + uint_array.value(i).to_string() + } + _ => return Err(DataFusionError::Internal("Unsupported array type".to_string())), + }; + + hash_results.push(hash_value(&value_str, &mut hasher, hash_type.clone())?); + } + Ok(hash_results) } \ No newline at end of file From 6353e6baba41b837dd0e395af292b5ab1acea09f Mon Sep 17 00:00:00 2001 From: Spaarsh-root Date: Wed, 29 Jan 2025 14:59:24 +0530 Subject: [PATCH 04/14] license fix --- datafusion/functions/src/hash/xxhash.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/datafusion/functions/src/hash/xxhash.rs b/datafusion/functions/src/hash/xxhash.rs index cedc93d83f43..ed0a8e37e014 100644 --- a/datafusion/functions/src/hash/xxhash.rs +++ b/datafusion/functions/src/hash/xxhash.rs @@ -1,17 +1,17 @@ // Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file +// or more contributor license agreements. See the NOTICE file // distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file +// regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at +// with the License. 
You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the +// KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. From 741fa18e391b37ad69692f0ac2c3dbb9d36484e1 Mon Sep 17 00:00:00 2001 From: Spaarsh-root Date: Wed, 5 Feb 2025 00:11:46 +0530 Subject: [PATCH 05/14] Optional seed added --- datafusion/functions/src/hash/mod.rs | 6 +- datafusion/functions/src/hash/xxhash.rs | 342 ++++++++++++++++-------- 2 files changed, 240 insertions(+), 108 deletions(-) diff --git a/datafusion/functions/src/hash/mod.rs b/datafusion/functions/src/hash/mod.rs index 5f204ccd5c95..fdf892669377 100644 --- a/datafusion/functions/src/hash/mod.rs +++ b/datafusion/functions/src/hash/mod.rs @@ -29,7 +29,7 @@ pub mod expr_fn { xxhash32, "Computes the XXHash32 hash of a binary string.", input - ),( + ) ,( xxhash64, "Computes the XXHash64 hash of a binary string.", input @@ -38,5 +38,7 @@ pub mod expr_fn { /// Returns all DataFusion functions defined in this package pub fn functions() -> Vec> { - vec![xxhash32(), xxhash64()] + vec![xxhash32(), + xxhash64() + ] } diff --git a/datafusion/functions/src/hash/xxhash.rs b/datafusion/functions/src/hash/xxhash.rs index ed0a8e37e014..9aa76394a949 100644 --- a/datafusion/functions/src/hash/xxhash.rs +++ b/datafusion/functions/src/hash/xxhash.rs @@ -15,11 +15,12 @@ // specific language governing permissions and limitations // under the License. -use arrow::array::{Array, StringArray, Int32Array, Int64Array, UInt32Array, UInt64Array}; +use arrow::array::{Array, StringArray, LargeStringArray, BinaryArray, LargeBinaryArray, }; use arrow::datatypes::DataType; -use datafusion_common::{Result, ScalarValue}; +use arrow::datatypes::DataType::{Utf8, Utf8View, LargeUtf8, Binary, LargeBinary, Int64, UInt32, UInt64}; +use datafusion_common::{Result, ScalarValue, plan_err}; use datafusion_expr::{ - ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, + ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, TypeSignature }; use twox_hash::{XxHash64, XxHash32}; use datafusion_macros::user_doc; @@ -30,84 +31,147 @@ use std::sync::Arc; #[user_doc( doc_section(label = "Hashing Functions"), - description = "Computes the XXHash64 hash of a binary string.", - syntax_example = "xxhash64(expression)", + description = "Computes the XXHash32 hash of a binary string.", + syntax_example = "xxhash32(expression)", sql_example = r#"```sql -> select xxhash64('foo'); +> select xxhash32('foo'); +-------------------------------------------+ -| xxhash64(Utf8("foo")) | +| xxhash32(Utf8("foo")) | +-------------------------------------------+ -| | +| | +-------------------------------------------+ ```"#, standard_argument(name = "expression", prefix = "String") )] #[derive(Debug)] -pub struct XxHash64Func { +pub struct XxHash32Func { signature: Signature, } -impl Default for XxHash64Func { +impl Default for XxHash32Func { fn default() -> Self { Self::new() } } -impl XxHash64Func { +impl XxHash32Func { pub fn new() -> Self { - use DataType::*; Self { - signature: Signature::uniform( - 1, - vec![Utf8View, Utf8, LargeUtf8, Binary, LargeBinary], + signature: Signature::one_of( + vec![ + TypeSignature::Exact(vec![Utf8View]), + 
TypeSignature::Exact(vec![Utf8]), + TypeSignature::Exact(vec![LargeUtf8]), + TypeSignature::Exact(vec![Binary]), + TypeSignature::Exact(vec![LargeBinary]), + TypeSignature::Exact(vec![Utf8View, Int64]), + TypeSignature::Exact(vec![Utf8, Int64]), + TypeSignature::Exact(vec![LargeUtf8, Int64]), + TypeSignature::Exact(vec![Binary, Int64]), + TypeSignature::Exact(vec![LargeBinary, Int64]), + ], Volatility::Immutable, ), } } - pub fn hash_scalar(&self, value: &ColumnarValue) -> Result { - let value_str = to_string_from_scalar(value)?; - hash_value(&value_str, XxHash64::default(), HashType::U64) + pub fn hash_scalar(&self, value: &[u8]) -> Result { + // let value_str = to_string_from_scalar(value)?; + hash_value(value, XxHash32::default(), HashType::U32) } } -impl ScalarUDFImpl for XxHash64Func { +impl ScalarUDFImpl for XxHash32Func { fn as_any(&self) -> &dyn Any { self } fn name(&self) -> &str { - "xxhash64" + "xxhash32" } fn signature(&self) -> &Signature { &self.signature } - fn return_type(&self, _arg_types: &[DataType]) -> Result { - Ok(DataType::Utf8) + fn return_type(&self, arg_types: &[DataType]) -> Result { + use DataType::*; + Ok(match &arg_types[0] { + LargeUtf8 | LargeBinary => Utf8, + Utf8View | Utf8 | Binary => Utf8, + Null => Null, + Dictionary(_, t) => match **t { + LargeUtf8 | LargeBinary => Utf8, + Utf8 | Binary => Utf8, + Null => Null, + _ => { + return plan_err!( + "the xxhash32 can only accept strings but got {:?}", + **t + ); + } + }, + other => { + return plan_err!( + "The xxhash32 function can only accept strings. Got {other}" + ); + } + }) } fn invoke_batch( &self, args: &[ColumnarValue], _number_rows: usize, - ) -> Result { + ) -> Result { let input_data = &args[0]; - let result = match input_data { - ColumnarValue::Array(array) => { - let hash_results = process_array(array, XxHash64::default(), HashType::U64)?; - let hash_array = StringArray::from(hash_results); - Arc::new(hash_array) as Arc - }, - ColumnarValue::Scalar(scalar) => { - let hash_result = self.hash_scalar(&ColumnarValue::Scalar(scalar.clone()))?; - let hash_array = StringArray::from(vec![hash_result]); - Arc::new(hash_array) as Arc + let seed = if args.len() > 1 { + if let ColumnarValue::Scalar(ScalarValue::Int64(Some(seed))) = &args[1] { + if *seed >= 0 && *seed <= u32::MAX as i64 { + *seed as u32 + } else { + return Err(DataFusionError::Execution(format!("Seed value out of range for UInt32: {}", seed))); + } + } + else{ + let actual_type = format!("{:?}", &args[1]); + return Err(DataFusionError::Execution(format!("Expected a Int64 seed value, but got {}", actual_type))); } + + } else { + 0 // Default seed value }; + let result = match input_data { + ColumnarValue::Array(array) => { + let hash_results = process_array(array, XxHash32::with_seed(seed), HashType::U32)?; + let hash_array = StringArray::from(hash_results); + Arc::new(hash_array) as Arc + }, + ColumnarValue::Scalar(scalar) => { + match scalar { + ScalarValue::Utf8(Some(ref v)) | ScalarValue::Utf8View(Some(ref v)) | ScalarValue::LargeUtf8(Some(ref v)) => { + if v.is_empty() { + return Ok(ColumnarValue::Array(Arc::new(StringArray::from(vec![""])))); + } + let hash_result = hash_value(v.as_bytes(), XxHash32::with_seed(seed), HashType::U32)?; + let hash_array = StringArray::from(vec![hash_result]); + Arc::new(hash_array) as Arc + } + ScalarValue::Binary(Some(ref v)) | ScalarValue::LargeBinary(Some(ref v)) => { + let hash_result = hash_value(v, XxHash32::with_seed(seed), HashType::U32)?; + let hash_array = StringArray::from(vec![hash_result]); + 
Arc::new(hash_array) as Arc + } + _ => { + let actual_type = format!("{:?}", scalar); + return Err(DataFusionError::Internal(format!("Unsupported scalar type: {}", actual_type))); + } + } + } + }; + Ok(ColumnarValue::Array(result)) } @@ -118,84 +182,147 @@ impl ScalarUDFImpl for XxHash64Func { #[user_doc( doc_section(label = "Hashing Functions"), - description = "Computes the XXHash32 hash of a binary string.", - syntax_example = "xxhash32(expression)", + description = "Computes the XXHash64 hash of a binary string.", + syntax_example = "xxhash64(expression)", sql_example = r#"```sql -> select xxhash32('foo'); +> select xxhash64('foo'); +-------------------------------------------+ -| xxhash32(Utf8("foo")) | +| xxhash64(Utf8("foo")) | +-------------------------------------------+ -| | +| | +-------------------------------------------+ ```"#, standard_argument(name = "expression", prefix = "String") )] #[derive(Debug)] -pub struct XxHash32Func { +pub struct XxHash64Func { signature: Signature, } -impl Default for XxHash32Func { +impl Default for XxHash64Func { fn default() -> Self { Self::new() } } -impl XxHash32Func { +impl XxHash64Func { pub fn new() -> Self { - use DataType::*; Self { - signature: Signature::uniform( - 1, - vec![Utf8View, Utf8, LargeUtf8, Binary, LargeBinary], + signature: Signature::one_of( + vec![ + TypeSignature::Exact(vec![Utf8View]), + TypeSignature::Exact(vec![Utf8]), + TypeSignature::Exact(vec![LargeUtf8]), + TypeSignature::Exact(vec![Binary]), + TypeSignature::Exact(vec![LargeBinary]), + TypeSignature::Exact(vec![Utf8View, Int64]), + TypeSignature::Exact(vec![Utf8, Int64]), + TypeSignature::Exact(vec![LargeUtf8, Int64]), + TypeSignature::Exact(vec![Binary, Int64]), + TypeSignature::Exact(vec![LargeBinary, Int64]), + ], Volatility::Immutable, ), } } - pub fn hash_scalar(&self, value: &ColumnarValue) -> Result { - let value_str = to_string_from_scalar(value)?; - hash_value(&value_str, XxHash32::default(), HashType::U32) + pub fn hash_scalar(&self, value: &[u8]) -> Result { + // let value_str = to_string_from_scalar(value)?; + hash_value(value, XxHash64::default(), HashType::U64) } } -impl ScalarUDFImpl for XxHash32Func { +impl ScalarUDFImpl for XxHash64Func { fn as_any(&self) -> &dyn Any { self } fn name(&self) -> &str { - "xxhash32" + "xxhash64" } fn signature(&self) -> &Signature { &self.signature } - fn return_type(&self, _arg_types: &[DataType]) -> Result { - Ok(DataType::Utf8) + fn return_type(&self, arg_types: &[DataType]) -> Result { + use DataType::*; + Ok(match &arg_types[0] { + LargeUtf8 | LargeBinary => Utf8, + Utf8View | Utf8 | Binary => Utf8, + Null => Null, + Dictionary(_, t) => match **t { + LargeUtf8 | LargeBinary => Utf8, + Utf8 | Binary => Utf8, + Null => Null, + _ => { + return plan_err!( + "the xxhash64 can only accept strings but got {:?}", + **t + ); + } + }, + other => { + return plan_err!( + "The xxhash64 function can only accept strings. 
Got {other}" + ); + } + }) } fn invoke_batch( &self, args: &[ColumnarValue], _number_rows: usize, - ) -> Result { + ) -> Result { let input_data = &args[0]; - let result = match input_data { - ColumnarValue::Array(array) => { - let hash_results = process_array(array, XxHash32::default(), HashType::U32)?; - let hash_array = StringArray::from(hash_results); - Arc::new(hash_array) as Arc - }, - ColumnarValue::Scalar(scalar) => { - let hash_result = self.hash_scalar(&ColumnarValue::Scalar(scalar.clone()))?; - let hash_array = StringArray::from(vec![hash_result]); - Arc::new(hash_array) as Arc + let seed = if args.len() > 1 { + if let ColumnarValue::Scalar(ScalarValue::Int64(Some(seed))) = &args[1] { + if *seed >= 0 { + *seed as u64 + } else { + return Err(DataFusionError::Execution(format!("Seed value out of range for UInt64: {}", seed))); + } } + else{ + let actual_type = format!("{:?}", &args[1]); + return Err(DataFusionError::Execution(format!("Expected a Int64 seed value, but got {}", actual_type))); + } + + } else { + 0 // Default seed value }; + let result = match input_data { + ColumnarValue::Array(array) => { + let hash_results = process_array(array, XxHash64::with_seed(seed), HashType::U64)?; + let hash_array = StringArray::from(hash_results); + Arc::new(hash_array) as Arc + }, + ColumnarValue::Scalar(scalar) => { + match scalar { + ScalarValue::Utf8(Some(ref v)) | ScalarValue::Utf8View(Some(ref v)) | ScalarValue::LargeUtf8(Some(ref v)) => { + if v.is_empty() { + return Ok(ColumnarValue::Array(Arc::new(StringArray::from(vec![""])))); + } + let hash_result = hash_value(v.as_bytes(), XxHash64::with_seed(seed), HashType::U64)?; + let hash_array = StringArray::from(vec![hash_result]); + Arc::new(hash_array) as Arc + } + ScalarValue::Binary(Some(ref v)) | ScalarValue::LargeBinary(Some(ref v)) => { + let hash_result = hash_value(v, XxHash64::with_seed(seed), HashType::U64)?; + let hash_array = StringArray::from(vec![hash_result]); + Arc::new(hash_array) as Arc + } + _ => { + let actual_type = format!("{:?}", scalar); + return Err(DataFusionError::Internal(format!("Unsupported scalar type: {}", actual_type))); + } + } + } + }; + Ok(ColumnarValue::Array(result)) } @@ -206,28 +333,14 @@ impl ScalarUDFImpl for XxHash32Func { // Helper functions -fn to_string_from_scalar(value: &ColumnarValue) -> Result { - match value { - ColumnarValue::Scalar(scalar) => match scalar { - ScalarValue::Utf8(Some(v)) => Ok(v.clone()), - ScalarValue::Int32(Some(v)) => Ok(v.to_string()), - ScalarValue::Int64(Some(v)) => Ok(v.to_string()), - ScalarValue::UInt32(Some(v)) => Ok(v.to_string()), - ScalarValue::UInt64(Some(v)) => Ok(v.to_string()), - _ => Err(DataFusionError::Internal("Unsupported scalar type".to_string())), - }, - _ => Err(DataFusionError::Internal("Expected a scalar value".to_string())), - } -} - #[derive(Clone)] pub enum HashType { U32, U64, } -fn hash_value(value_str: &str, mut hasher: T, hash_type: HashType) -> Result { - hasher.write(value_str.as_bytes()); +fn hash_value(value_bytes: &[u8], mut hasher: T, hash_type: HashType) -> Result { + hasher.write(value_bytes); let hash = hasher.finish(); match hash_type { HashType::U32 => { @@ -243,37 +356,54 @@ fn hash_value(value_str: &str, mut hasher: T, hash_type: HashType) -> fn process_array(array: &dyn Array, mut hasher: T, hash_type: HashType) -> Result> { let mut hash_results: Vec = Vec::with_capacity(array.len()); - for i in 0..array.len() { - if array.is_null(i) { - hash_results.push(String::from("00000000")); // Handle null values - continue; - } - 
let value_str = match array.data_type() { - DataType::Utf8 => { - let string_array = array.as_any().downcast_ref::().unwrap(); - string_array.value(i).to_string() - } - DataType::Int32 => { - let int_array = array.as_any().downcast_ref::().unwrap(); - int_array.value(i).to_string() + match array.data_type() { + Utf8 | Utf8View | LargeUtf8 => { + let string_array = array.as_any().downcast_ref::().unwrap(); + for i in 0..array.len() { + if array.is_null(i) { + hash_results.push(String::new()); // Handle null values + continue; + } + let value = string_array.value(i); + if value.is_empty() { + hash_results.push(String::new()); + continue; + } + hash_results.push(hash_value(value.as_bytes(), &mut hasher, hash_type.clone())?); } - DataType::Int64 => { - let int_array = array.as_any().downcast_ref::().unwrap(); - int_array.value(i).to_string() - } - DataType::UInt32 => { - let uint_array = array.as_any().downcast_ref::().unwrap(); - uint_array.value(i).to_string() - } - DataType::UInt64 => { - let uint_array = array.as_any().downcast_ref::().unwrap(); - uint_array.value(i).to_string() + } + + Binary | LargeBinary => { + let binary_array: &dyn Array = if array.data_type() == &Binary { + array.as_any().downcast_ref::().unwrap() + } else { + array.as_any().downcast_ref::().unwrap() + }; + for i in 0..array.len() { + if array.is_null(i) { + hash_results.push(String::new()); // Handle null values + continue; + } + let value = if let Some(binary_array) = binary_array.as_any().downcast_ref::() { + binary_array.value(i) + } else { + binary_array.as_any().downcast_ref::().unwrap().value(i) + }; + hash_results.push(hash_value(value, &mut hasher, hash_type.clone())?); } - _ => return Err(DataFusionError::Internal("Unsupported array type".to_string())), - }; + } - hash_results.push(hash_value(&value_str, &mut hasher, hash_type.clone())?); + DataType::Null => { + for _ in 0..array.len() { + hash_results.push(String::new()); // Handle null values + } + } + _ => { + let actual_type = format!("{:?}", array.data_type()); + return Err(DataFusionError::Internal(format!("Unsupported array type: {}", actual_type))); + }, } + Ok(hash_results) } \ No newline at end of file From 4d2475966a107a64d1e6f3efa8a11574eb32ea2a Mon Sep 17 00:00:00 2001 From: Spaarsh-root Date: Wed, 5 Feb 2025 00:50:38 +0530 Subject: [PATCH 06/14] Fixed test failings --- datafusion/functions/src/hash/xxhash.rs | 14 ++++-- .../source/user-guide/sql/scalar_functions.md | 48 +++++++++++++++++++ 2 files changed, 59 insertions(+), 3 deletions(-) diff --git a/datafusion/functions/src/hash/xxhash.rs b/datafusion/functions/src/hash/xxhash.rs index 9aa76394a949..4b3d496cb0dd 100644 --- a/datafusion/functions/src/hash/xxhash.rs +++ b/datafusion/functions/src/hash/xxhash.rs @@ -17,7 +17,7 @@ use arrow::array::{Array, StringArray, LargeStringArray, BinaryArray, LargeBinaryArray, }; use arrow::datatypes::DataType; -use arrow::datatypes::DataType::{Utf8, Utf8View, LargeUtf8, Binary, LargeBinary, Int64, UInt32, UInt64}; +use arrow::datatypes::DataType::{Utf8, Utf8View, LargeUtf8, Binary, LargeBinary, Int64}; use datafusion_common::{Result, ScalarValue, plan_err}; use datafusion_expr::{ ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, TypeSignature @@ -359,13 +359,21 @@ fn process_array(array: &dyn Array, mut hasher: T, hash_type: HashTyp match array.data_type() { Utf8 | Utf8View | LargeUtf8 => { - let string_array = array.as_any().downcast_ref::().unwrap(); + let string_array: &dyn Array = if array.data_type() == &Utf8 || array.data_type() 
== &Utf8View { + array.as_any().downcast_ref::().unwrap() + } else { + array.as_any().downcast_ref::().unwrap() + }; for i in 0..array.len() { if array.is_null(i) { hash_results.push(String::new()); // Handle null values continue; } - let value = string_array.value(i); + let value = if let Some(string_array) = string_array.as_any().downcast_ref::() { + string_array.value(i) + } else { + string_array.as_any().downcast_ref::().unwrap().value(i) + }; if value.is_empty() { hash_results.push(String::new()); continue; diff --git a/docs/source/user-guide/sql/scalar_functions.md b/docs/source/user-guide/sql/scalar_functions.md index a64ed597e007..657b5f3acc65 100644 --- a/docs/source/user-guide/sql/scalar_functions.md +++ b/docs/source/user-guide/sql/scalar_functions.md @@ -4185,6 +4185,8 @@ SELECT map_values(map([100, 5], [42, 43])); - [sha256](#sha256) - [sha384](#sha384) - [sha512](#sha512) +- [xxhash32](#xxhash32) +- [xxhash64](#xxhash64) ### `digest` @@ -4333,6 +4335,52 @@ sha512(expression) +-------------------------------------------+ ``` +### `xxhash32` + +Computes the XXHash32 hash of a binary string. + +``` +xxhash32(expression) +``` + +#### Arguments + +- **expression**: String expression to operate on. Can be a constant, column, or function, and any combination of operators. + +#### Example + +```sql +> select xxhash32('foo'); ++-------------------------------------------+ +| xxhash32(Utf8("foo")) | ++-------------------------------------------+ +| | ++-------------------------------------------+ +``` + +### `xxhash64` + +Computes the XXHash64 hash of a binary string. + +``` +xxhash64(expression) +``` + +#### Arguments + +- **expression**: String expression to operate on. Can be a constant, column, or function, and any combination of operators. 
+ +#### Example + +```sql +> select xxhash64('foo'); ++-------------------------------------------+ +| xxhash64(Utf8("foo")) | ++-------------------------------------------+ +| | ++-------------------------------------------+ +``` + ## Other Functions - [arrow_cast](#arrow_cast) From 5f855115b0dea215e9638bc171fb0e0477f25c45 Mon Sep 17 00:00:00 2001 From: Spaarsh-root Date: Wed, 5 Feb 2025 01:21:57 +0530 Subject: [PATCH 07/14] Fixed clippy failing for unnecessary cast --- datafusion/functions/src/hash/xxhash.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/functions/src/hash/xxhash.rs b/datafusion/functions/src/hash/xxhash.rs index 4b3d496cb0dd..45e19ce80a4f 100644 --- a/datafusion/functions/src/hash/xxhash.rs +++ b/datafusion/functions/src/hash/xxhash.rs @@ -348,7 +348,7 @@ fn hash_value(value_bytes: &[u8], mut hasher: T, hash_type: HashType) Ok(hex::encode(hash_u32.to_be_bytes())) }, HashType::U64 => { - let hash_u64 = hash as u64; + let hash_u64 = hash; Ok(hex::encode(hash_u64.to_be_bytes())) }, } From 0d15f9cb52a9ad74605dc0356fa4d483cbe23286 Mon Sep 17 00:00:00 2001 From: Spaarsh-root Date: Wed, 5 Feb 2025 19:06:10 +0530 Subject: [PATCH 08/14] Fixed CI test fails --- datafusion/functions/src/hash/mod.rs | 25 +-- datafusion/functions/src/hash/xxhash.rs | 194 ++++++++++++++++-------- datafusion/functions/src/lib.rs | 4 +- 3 files changed, 142 insertions(+), 81 deletions(-) diff --git a/datafusion/functions/src/hash/mod.rs b/datafusion/functions/src/hash/mod.rs index fdf892669377..1268b463767c 100644 --- a/datafusion/functions/src/hash/mod.rs +++ b/datafusion/functions/src/hash/mod.rs @@ -25,20 +25,21 @@ make_udf_function!(xxhash::XxHash32Func, xxhash32); make_udf_function!(xxhash::XxHash64Func, xxhash64); pub mod expr_fn { - export_functions!(( - xxhash32, - "Computes the XXHash32 hash of a binary string.", - input - ) ,( - xxhash64, - "Computes the XXHash64 hash of a binary string.", - input - )); + export_functions!( + ( + xxhash32, + "Computes the XXHash32 hash of a binary string.", + input + ), + ( + xxhash64, + "Computes the XXHash64 hash of a binary string.", + input + ) + ); } /// Returns all DataFusion functions defined in this package pub fn functions() -> Vec> { - vec![xxhash32(), - xxhash64() - ] + vec![xxhash32(), xxhash64()] } diff --git a/datafusion/functions/src/hash/xxhash.rs b/datafusion/functions/src/hash/xxhash.rs index 45e19ce80a4f..d9a671c9ff48 100644 --- a/datafusion/functions/src/hash/xxhash.rs +++ b/datafusion/functions/src/hash/xxhash.rs @@ -15,19 +15,19 @@ // specific language governing permissions and limitations // under the License. 
-use arrow::array::{Array, StringArray, LargeStringArray, BinaryArray, LargeBinaryArray, }; +use arrow::array::{Array, BinaryArray, LargeBinaryArray, LargeStringArray, StringArray}; use arrow::datatypes::DataType; -use arrow::datatypes::DataType::{Utf8, Utf8View, LargeUtf8, Binary, LargeBinary, Int64}; -use datafusion_common::{Result, ScalarValue, plan_err}; +use arrow::datatypes::DataType::{Binary, Int64, LargeBinary, LargeUtf8, Utf8, Utf8View}; +use datafusion_common::DataFusionError; +use datafusion_common::{plan_err, Result, ScalarValue}; use datafusion_expr::{ - ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, TypeSignature + ColumnarValue, Documentation, ScalarUDFImpl, Signature, TypeSignature, Volatility, }; -use twox_hash::{XxHash64, XxHash32}; use datafusion_macros::user_doc; use std::any::Any; use std::hash::Hasher; -use datafusion_common::DataFusionError; use std::sync::Arc; +use twox_hash::{XxHash32, XxHash64}; #[user_doc( doc_section(label = "Hashing Functions"), @@ -123,7 +123,7 @@ impl ScalarUDFImpl for XxHash32Func { &self, args: &[ColumnarValue], _number_rows: usize, - ) -> Result { + ) -> Result { let input_data = &args[0]; let seed = if args.len() > 1 { @@ -131,46 +131,62 @@ impl ScalarUDFImpl for XxHash32Func { if *seed >= 0 && *seed <= u32::MAX as i64 { *seed as u32 } else { - return Err(DataFusionError::Execution(format!("Seed value out of range for UInt32: {}", seed))); + return Err(DataFusionError::Execution(format!( + "Seed value out of range for UInt32: {}", + seed + ))); } - } - else{ + } else { let actual_type = format!("{:?}", &args[1]); - return Err(DataFusionError::Execution(format!("Expected a Int64 seed value, but got {}", actual_type))); + return Err(DataFusionError::Execution(format!( + "Expected a Int64 seed value, but got {}", + actual_type + ))); } - } else { 0 // Default seed value }; let result = match input_data { - ColumnarValue::Array(array) => { - let hash_results = process_array(array, XxHash32::with_seed(seed), HashType::U32)?; - let hash_array = StringArray::from(hash_results); - Arc::new(hash_array) as Arc - }, - ColumnarValue::Scalar(scalar) => { - match scalar { - ScalarValue::Utf8(Some(ref v)) | ScalarValue::Utf8View(Some(ref v)) | ScalarValue::LargeUtf8(Some(ref v)) => { + ColumnarValue::Array(array) => { + let hash_results = + process_array(array, XxHash32::with_seed(seed), HashType::U32)?; + let hash_array = StringArray::from(hash_results); + Arc::new(hash_array) as Arc + } + ColumnarValue::Scalar(scalar) => match scalar { + ScalarValue::Utf8(Some(ref v)) + | ScalarValue::Utf8View(Some(ref v)) + | ScalarValue::LargeUtf8(Some(ref v)) => { if v.is_empty() { - return Ok(ColumnarValue::Array(Arc::new(StringArray::from(vec![""])))); + return Ok(ColumnarValue::Array(Arc::new(StringArray::from( + vec![""], + )))); } - let hash_result = hash_value(v.as_bytes(), XxHash32::with_seed(seed), HashType::U32)?; + let hash_result = hash_value( + v.as_bytes(), + XxHash32::with_seed(seed), + HashType::U32, + )?; let hash_array = StringArray::from(vec![hash_result]); Arc::new(hash_array) as Arc } - ScalarValue::Binary(Some(ref v)) | ScalarValue::LargeBinary(Some(ref v)) => { - let hash_result = hash_value(v, XxHash32::with_seed(seed), HashType::U32)?; + ScalarValue::Binary(Some(ref v)) + | ScalarValue::LargeBinary(Some(ref v)) => { + let hash_result = + hash_value(v, XxHash32::with_seed(seed), HashType::U32)?; let hash_array = StringArray::from(vec![hash_result]); Arc::new(hash_array) as Arc } _ => { let actual_type = format!("{:?}", 
scalar); - return Err(DataFusionError::Internal(format!("Unsupported scalar type: {}", actual_type))); + return Err(DataFusionError::Internal(format!( + "Unsupported scalar type: {}", + actual_type + ))); } - } - } - }; + }, + }; Ok(ColumnarValue::Array(result)) } @@ -274,7 +290,7 @@ impl ScalarUDFImpl for XxHash64Func { &self, args: &[ColumnarValue], _number_rows: usize, - ) -> Result { + ) -> Result { let input_data = &args[0]; let seed = if args.len() > 1 { @@ -282,46 +298,62 @@ impl ScalarUDFImpl for XxHash64Func { if *seed >= 0 { *seed as u64 } else { - return Err(DataFusionError::Execution(format!("Seed value out of range for UInt64: {}", seed))); + return Err(DataFusionError::Execution(format!( + "Seed value out of range for UInt64: {}", + seed + ))); } - } - else{ + } else { let actual_type = format!("{:?}", &args[1]); - return Err(DataFusionError::Execution(format!("Expected a Int64 seed value, but got {}", actual_type))); + return Err(DataFusionError::Execution(format!( + "Expected a Int64 seed value, but got {}", + actual_type + ))); } - } else { 0 // Default seed value }; let result = match input_data { - ColumnarValue::Array(array) => { - let hash_results = process_array(array, XxHash64::with_seed(seed), HashType::U64)?; - let hash_array = StringArray::from(hash_results); - Arc::new(hash_array) as Arc - }, - ColumnarValue::Scalar(scalar) => { - match scalar { - ScalarValue::Utf8(Some(ref v)) | ScalarValue::Utf8View(Some(ref v)) | ScalarValue::LargeUtf8(Some(ref v)) => { + ColumnarValue::Array(array) => { + let hash_results = + process_array(array, XxHash64::with_seed(seed), HashType::U64)?; + let hash_array = StringArray::from(hash_results); + Arc::new(hash_array) as Arc + } + ColumnarValue::Scalar(scalar) => match scalar { + ScalarValue::Utf8(Some(ref v)) + | ScalarValue::Utf8View(Some(ref v)) + | ScalarValue::LargeUtf8(Some(ref v)) => { if v.is_empty() { - return Ok(ColumnarValue::Array(Arc::new(StringArray::from(vec![""])))); + return Ok(ColumnarValue::Array(Arc::new(StringArray::from( + vec![""], + )))); } - let hash_result = hash_value(v.as_bytes(), XxHash64::with_seed(seed), HashType::U64)?; + let hash_result = hash_value( + v.as_bytes(), + XxHash64::with_seed(seed), + HashType::U64, + )?; let hash_array = StringArray::from(vec![hash_result]); Arc::new(hash_array) as Arc } - ScalarValue::Binary(Some(ref v)) | ScalarValue::LargeBinary(Some(ref v)) => { - let hash_result = hash_value(v, XxHash64::with_seed(seed), HashType::U64)?; + ScalarValue::Binary(Some(ref v)) + | ScalarValue::LargeBinary(Some(ref v)) => { + let hash_result = + hash_value(v, XxHash64::with_seed(seed), HashType::U64)?; let hash_array = StringArray::from(vec![hash_result]); Arc::new(hash_array) as Arc } _ => { let actual_type = format!("{:?}", scalar); - return Err(DataFusionError::Internal(format!("Unsupported scalar type: {}", actual_type))); + return Err(DataFusionError::Internal(format!( + "Unsupported scalar type: {}", + actual_type + ))); } - } - } - }; + }, + }; Ok(ColumnarValue::Array(result)) } @@ -339,46 +371,65 @@ pub enum HashType { U64, } -fn hash_value(value_bytes: &[u8], mut hasher: T, hash_type: HashType) -> Result { +fn hash_value( + value_bytes: &[u8], + mut hasher: T, + hash_type: HashType, +) -> Result { hasher.write(value_bytes); let hash = hasher.finish(); match hash_type { HashType::U32 => { let hash_u32 = hash as u32; Ok(hex::encode(hash_u32.to_be_bytes())) - }, + } HashType::U64 => { let hash_u64 = hash; Ok(hex::encode(hash_u64.to_be_bytes())) - }, + } } } -fn 
process_array(array: &dyn Array, mut hasher: T, hash_type: HashType) -> Result> { +fn process_array( + array: &dyn Array, + mut hasher: T, + hash_type: HashType, +) -> Result> { let mut hash_results: Vec = Vec::with_capacity(array.len()); match array.data_type() { Utf8 | Utf8View | LargeUtf8 => { - let string_array: &dyn Array = if array.data_type() == &Utf8 || array.data_type() == &Utf8View { - array.as_any().downcast_ref::().unwrap() - } else { - array.as_any().downcast_ref::().unwrap() - }; + let string_array: &dyn Array = + if array.data_type() == &Utf8 || array.data_type() == &Utf8View { + array.as_any().downcast_ref::().unwrap() + } else { + array.as_any().downcast_ref::().unwrap() + }; for i in 0..array.len() { if array.is_null(i) { hash_results.push(String::new()); // Handle null values continue; } - let value = if let Some(string_array) = string_array.as_any().downcast_ref::() { + let value = if let Some(string_array) = + string_array.as_any().downcast_ref::() + { string_array.value(i) } else { - string_array.as_any().downcast_ref::().unwrap().value(i) + string_array + .as_any() + .downcast_ref::() + .unwrap() + .value(i) }; if value.is_empty() { hash_results.push(String::new()); continue; } - hash_results.push(hash_value(value.as_bytes(), &mut hasher, hash_type.clone())?); + hash_results.push(hash_value( + value.as_bytes(), + &mut hasher, + hash_type.clone(), + )?); } } @@ -393,10 +444,16 @@ fn process_array(array: &dyn Array, mut hasher: T, hash_type: HashTyp hash_results.push(String::new()); // Handle null values continue; } - let value = if let Some(binary_array) = binary_array.as_any().downcast_ref::() { + let value = if let Some(binary_array) = + binary_array.as_any().downcast_ref::() + { binary_array.value(i) } else { - binary_array.as_any().downcast_ref::().unwrap().value(i) + binary_array + .as_any() + .downcast_ref::() + .unwrap() + .value(i) }; hash_results.push(hash_value(value, &mut hasher, hash_type.clone())?); } @@ -409,9 +466,12 @@ fn process_array(array: &dyn Array, mut hasher: T, hash_type: HashTyp } _ => { let actual_type = format!("{:?}", array.data_type()); - return Err(DataFusionError::Internal(format!("Unsupported array type: {}", actual_type))); - }, + return Err(DataFusionError::Internal(format!( + "Unsupported array type: {}", + actual_type + ))); + } } Ok(hash_results) -} \ No newline at end of file +} diff --git a/datafusion/functions/src/lib.rs b/datafusion/functions/src/lib.rs index 0d2806255144..b722740fece1 100644 --- a/datafusion/functions/src/lib.rs +++ b/datafusion/functions/src/lib.rs @@ -154,6 +154,8 @@ pub mod expr_fn { pub use super::datetime::expr_fn::*; #[cfg(feature = "encoding_expressions")] pub use super::encoding::expr_fn::*; + #[cfg(feature = "hash_expressions")] + pub use super::hash::expr_fn::*; #[cfg(feature = "math_expressions")] pub use super::math::expr_fn::*; #[cfg(feature = "regex_expressions")] @@ -162,8 +164,6 @@ pub mod expr_fn { pub use super::string::expr_fn::*; #[cfg(feature = "unicode_expressions")] pub use super::unicode::expr_fn::*; - #[cfg(feature = "hash_expressions")] - pub use super::hash::expr_fn::*; } /// Return all default functions From a72994231015a18aec9f20878a520f15b97ad8ba Mon Sep 17 00:00:00 2001 From: Spaarsh-root Date: Wed, 5 Feb 2025 22:14:34 +0530 Subject: [PATCH 09/14] Added support for Null inputs and corrected output for empty inputs --- datafusion/functions/src/hash/xxhash.rs | 28 ++++++++++++------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git 
a/datafusion/functions/src/hash/xxhash.rs b/datafusion/functions/src/hash/xxhash.rs index d9a671c9ff48..fae8a341d094 100644 --- a/datafusion/functions/src/hash/xxhash.rs +++ b/datafusion/functions/src/hash/xxhash.rs @@ -155,14 +155,16 @@ impl ScalarUDFImpl for XxHash32Func { Arc::new(hash_array) as Arc } ColumnarValue::Scalar(scalar) => match scalar { + ScalarValue::Utf8(None) + | ScalarValue::Utf8View(None) + | ScalarValue::LargeUtf8(None) => { + let hash_array = StringArray::from(vec![String::new()]); + Arc::new(hash_array) as Arc + } ScalarValue::Utf8(Some(ref v)) | ScalarValue::Utf8View(Some(ref v)) | ScalarValue::LargeUtf8(Some(ref v)) => { - if v.is_empty() { - return Ok(ColumnarValue::Array(Arc::new(StringArray::from( - vec![""], - )))); - } + let hash_result = hash_value( v.as_bytes(), XxHash32::with_seed(seed), @@ -322,14 +324,15 @@ impl ScalarUDFImpl for XxHash64Func { Arc::new(hash_array) as Arc } ColumnarValue::Scalar(scalar) => match scalar { + ScalarValue::Utf8(None) + | ScalarValue::Utf8View(None) + | ScalarValue::LargeUtf8(None) => { + let hash_array = StringArray::from(vec![String::new()]); + Arc::new(hash_array) as Arc + } ScalarValue::Utf8(Some(ref v)) | ScalarValue::Utf8View(Some(ref v)) | ScalarValue::LargeUtf8(Some(ref v)) => { - if v.is_empty() { - return Ok(ColumnarValue::Array(Arc::new(StringArray::from( - vec![""], - )))); - } let hash_result = hash_value( v.as_bytes(), XxHash64::with_seed(seed), @@ -421,10 +424,7 @@ fn process_array( .unwrap() .value(i) }; - if value.is_empty() { - hash_results.push(String::new()); - continue; - } + hash_results.push(hash_value( value.as_bytes(), &mut hasher, From 60e4db74b9315dbd80aa4749f64c1c32686df55b Mon Sep 17 00:00:00 2001 From: Spaarsh-root Date: Wed, 5 Feb 2025 23:20:13 +0530 Subject: [PATCH 10/14] Fixed failing fmt checks --- datafusion/functions/src/hash/xxhash.rs | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/datafusion/functions/src/hash/xxhash.rs b/datafusion/functions/src/hash/xxhash.rs index fae8a341d094..1239ca89246f 100644 --- a/datafusion/functions/src/hash/xxhash.rs +++ b/datafusion/functions/src/hash/xxhash.rs @@ -155,8 +155,8 @@ impl ScalarUDFImpl for XxHash32Func { Arc::new(hash_array) as Arc } ColumnarValue::Scalar(scalar) => match scalar { - ScalarValue::Utf8(None) - | ScalarValue::Utf8View(None) + ScalarValue::Utf8(None) + | ScalarValue::Utf8View(None) | ScalarValue::LargeUtf8(None) => { let hash_array = StringArray::from(vec![String::new()]); Arc::new(hash_array) as Arc @@ -164,7 +164,6 @@ impl ScalarUDFImpl for XxHash32Func { ScalarValue::Utf8(Some(ref v)) | ScalarValue::Utf8View(Some(ref v)) | ScalarValue::LargeUtf8(Some(ref v)) => { - let hash_result = hash_value( v.as_bytes(), XxHash32::with_seed(seed), @@ -324,8 +323,8 @@ impl ScalarUDFImpl for XxHash64Func { Arc::new(hash_array) as Arc } ColumnarValue::Scalar(scalar) => match scalar { - ScalarValue::Utf8(None) - | ScalarValue::Utf8View(None) + ScalarValue::Utf8(None) + | ScalarValue::Utf8View(None) | ScalarValue::LargeUtf8(None) => { let hash_array = StringArray::from(vec![String::new()]); Arc::new(hash_array) as Arc From f8e871b45fa8a48bf9add4a4b3b2ced4e9f8f576 Mon Sep 17 00:00:00 2001 From: Spaarsh-root Date: Thu, 6 Feb 2025 00:11:30 +0530 Subject: [PATCH 11/14] Added hash test file (xxhash32 and xxhash64) --- datafusion/sqllogictest/test_files/hash.slt | 113 ++++++++++++++++++++ 1 file changed, 113 insertions(+) create mode 100644 datafusion/sqllogictest/test_files/hash.slt diff --git 
a/datafusion/sqllogictest/test_files/hash.slt b/datafusion/sqllogictest/test_files/hash.slt new file mode 100644 index 000000000000..1b4134ab6893 --- /dev/null +++ b/datafusion/sqllogictest/test_files/hash.slt @@ -0,0 +1,113 @@ +# Test xxhash32 with string input +query T +SELECT xxhash32('foo') AS hash_value; +---- +e20f0dd9 + +# Test xxhash32 with array input +query T +SELECT xxhash32(column1) AS xxhash32_result FROM ( SELECT UNNEST(ARRAY[1, 2, 3, 4, 5]) AS column1 ) AS subquery; +---- +b6ecc8b2 +d43589af +b6855437 +01543429 +b30d56b4 + +query T +SELECT xxhash32(NULL) AS hash_value; +---- +(empty) + +# Test xxhash32 with string input and seed of 1 +query T +SELECT xxhash32('foo', 1) AS hash_value; +---- +1742761f + +# Test xxhash32 with array input and seed of 1 +query T +SELECT xxhash32(column1, 1) AS xxhash32_result FROM ( SELECT UNNEST(ARRAY[1, 2, 3, 4, 5]) AS column1 ) AS subquery; +---- +642684c5 +df0e3329 +99280b78 +e17e2fa9 +97a348b6 + +# Test xxhash32 with null input and seed of 1 +query T +SELECT xxhash32(NULL, 1) AS hash_value; +---- +(empty) + +# Test xxhash32 with binary input +query T +SELECT xxhash32(X'1') AS hash_value; +---- +3892f731 + + +# Test xxhash32 with binary input and seed of 1 +query T +SELECT xxhash32('foo'::BYTEA, 1) AS hash_value; +---- +1742761f + +# Tests for xxhash64 + +# Test xxhash64 with string input +query T +SELECT xxhash64('foo') AS hash_value; +---- +33bf00a859c4ba3f + +# Test xxhash64 with array input +query T +SELECT xxhash64(column1) AS xxhash64_result FROM ( SELECT UNNEST(ARRAY[1, 2, 3, 4, 5]) AS column1 ) AS subquery; +---- +b7b41276360564d4 +5460f49adbe7aba2 +3c697d223fa7e885 +d8316e61d84f6ba4 +c6f2d2dd0ad64fb6 + +query T +SELECT xxhash64(NULL) AS hash_value; +---- +(empty) + +# Test xxhash64 with string input and seed of 1 +query T +SELECT xxhash64('foo', 1) AS hash_value; +---- +c34823c5bf4f2cbd + +# Test xxhash64 with array input and seed of 1 +query T +SELECT xxhash64(column1, 1) AS xxhash64_result FROM ( SELECT UNNEST(ARRAY[1, 2, 3, 4, 5]) AS column1 ) AS subquery; +---- +192aba5fd13fb67d +75b53fdb7dce12fa +4b805d862c3b7497 +e9feb3476d8788cb +8b4dc636e784c7e5 + +# Test xxhash64 with null input and seed of 1 +query T +SELECT xxhash64(NULL, 1) AS hash_value; +---- +(empty) + +# Test xxhash64 with binary input +query T +SELECT xxhash64(X'1') AS hash_value; +---- +8a4127811b21e730 + + +# Test xxhash64 with binary input and seed of 1 +query T +SELECT xxhash64('foo'::BYTEA, 1) AS hash_value; +---- +c34823c5bf4f2cbd \ No newline at end of file From 564af36107fd2781ca5b5aa9fdcb6f9b85aaec9d Mon Sep 17 00:00:00 2001 From: Spaarsh-root Date: Thu, 6 Feb 2025 00:40:43 +0530 Subject: [PATCH 12/14] Minor fix --- datafusion-cli/Cargo.lock | 269 ++++++++++++++++++++++---------------- 1 file changed, 154 insertions(+), 115 deletions(-) diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 901e1f33460a..4a97d3472edb 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -31,7 +31,7 @@ checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011" dependencies = [ "cfg-if", "const-random", - "getrandom", + "getrandom 0.2.15", "once_cell", "version_check", "zerocopy", @@ -175,9 +175,9 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "arrow" -version = "54.0.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2ccdcc8fb14508ca20aaec7076032e5c0b0751b906036d4496786e2f227a37a" +checksum = 
"6422e12ac345a0678d7a17e316238e3a40547ae7f92052b77bd86d5e0239f3fc" dependencies = [ "arrow-arith", "arrow-array", @@ -196,9 +196,9 @@ dependencies = [ [[package]] name = "arrow-arith" -version = "54.0.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1aad8e27f32e411a0fc0bf5a625a35f0bf9b9f871cf4542abe31f7cef4beea2" +checksum = "23cf34bb1f48c41d3475927bcc7be498665b8e80b379b88f62a840337f8b8248" dependencies = [ "arrow-array", "arrow-buffer", @@ -210,9 +210,9 @@ dependencies = [ [[package]] name = "arrow-array" -version = "54.0.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd6ed90c28c6f73a706c55799b8cc3a094e89257238e5b1d65ca7c70bd3ae23f" +checksum = "fb4a06d507f54b70a277be22a127c8ffe0cec6cd98c0ad8a48e77779bbda8223" dependencies = [ "ahash", "arrow-buffer", @@ -227,9 +227,9 @@ dependencies = [ [[package]] name = "arrow-buffer" -version = "54.0.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fe4a40bdc1552ea10fbdeae4e5a945d8572c32f66bce457b96c13d9c46b80447" +checksum = "d69d326d5ad1cb82dcefa9ede3fee8fdca98f9982756b16f9cb142f4aa6edc89" dependencies = [ "bytes", "half", @@ -238,9 +238,9 @@ dependencies = [ [[package]] name = "arrow-cast" -version = "54.0.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "430c0a21aa7f81bcf0f97c57216d7127795ea755f494d27bae2bd233be43c2cc" +checksum = "626e65bd42636a84a238bed49d09c8777e3d825bf81f5087a70111c2831d9870" dependencies = [ "arrow-array", "arrow-buffer", @@ -259,9 +259,9 @@ dependencies = [ [[package]] name = "arrow-csv" -version = "54.0.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4444c8f8c57ac00e6a679ede67d1ae8872c170797dc45b46f75702437a77888" +checksum = "71c8f959f7a1389b1dbd883cdcd37c3ed12475329c111912f7f69dad8195d8c6" dependencies = [ "arrow-array", "arrow-cast", @@ -275,9 +275,9 @@ dependencies = [ [[package]] name = "arrow-data" -version = "54.0.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09af476cfbe9879937e50b1334c73189de6039186e025b1b1ac84b283b87b20e" +checksum = "1858e7c7d01c44cf71c21a85534fd1a54501e8d60d1195d0d6fbcc00f4b10754" dependencies = [ "arrow-buffer", "arrow-schema", @@ -287,9 +287,9 @@ dependencies = [ [[package]] name = "arrow-ipc" -version = "54.0.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "136296e8824333a8a4c4a6e508e4aa65d5678b801246d0408825ae7b2523c628" +checksum = "a6bb3f727f049884c7603f0364bc9315363f356b59e9f605ea76541847e06a1e" dependencies = [ "arrow-array", "arrow-buffer", @@ -301,9 +301,9 @@ dependencies = [ [[package]] name = "arrow-json" -version = "54.0.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e222ad0e419ab8276818c5605a5bb1e35ed86fa8c5e550726433cc63b09c3c78" +checksum = "35de94f165ed8830aede72c35f238763794f0d49c69d30c44d49c9834267ff8c" dependencies = [ "arrow-array", "arrow-buffer", @@ -321,9 +321,9 @@ dependencies = [ [[package]] name = "arrow-ord" -version = "54.0.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eddf14c5f03b679ec8ceac4dfac43f63cdc4ed54dab3cc120a4ef46af38481eb" +checksum = "8aa06e5f267dc53efbacb933485c79b6fc1685d3ffbe870a16ce4e696fb429da" dependencies = [ "arrow-array", "arrow-buffer", @@ -334,9 +334,9 @@ dependencies = [ 
[[package]] name = "arrow-row" -version = "54.0.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e9acdc58da19f383f4ba381fa0e3583534ae2ceb31269aaf4a03f08ff13e8443" +checksum = "66f1144bb456a2f9d82677bd3abcea019217e572fc8f07de5a7bac4b2c56eb2c" dependencies = [ "arrow-array", "arrow-buffer", @@ -347,15 +347,15 @@ dependencies = [ [[package]] name = "arrow-schema" -version = "54.0.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3a1822a1a952955637e85e8f9d6b0e04dd75d65492b87ec548dd593d3a1f772b" +checksum = "105f01ec0090259e9a33a9263ec18ff223ab91a0ea9fbc18042f7e38005142f6" [[package]] name = "arrow-select" -version = "54.0.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c4172e9a12dfe15303d3926269f9ead471ea93bdd067d113abc65cb6c48e246" +checksum = "f690752fdbd2dee278b5f1636fefad8f2f7134c85e20fd59c4199e15a39a6807" dependencies = [ "ahash", "arrow-array", @@ -367,9 +367,9 @@ dependencies = [ [[package]] name = "arrow-string" -version = "54.0.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73683040445f4932342781926189901c9521bb1a787c35dbe628a3ce51372d3c" +checksum = "d0fff9cd745a7039b66c47ecaf5954460f9fa12eed628f65170117ea93e64ee0" dependencies = [ "arrow-array", "arrow-buffer", @@ -417,9 +417,9 @@ dependencies = [ [[package]] name = "async-trait" -version = "0.1.85" +version = "0.1.86" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f934833b4b7233644e5848f235df3f57ed8c80f1528a26c3dfa13d2147fa056" +checksum = "644dd749086bf3771a2fbc5f256fdb982d53f011c7d5d560304eafeecebce79d" dependencies = [ "proc-macro2", "quote", @@ -491,9 +491,9 @@ dependencies = [ [[package]] name = "aws-runtime" -version = "1.5.4" +version = "1.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bee7643696e7fdd74c10f9eb42848a87fe469d35eae9c3323f80aa98f350baac" +checksum = "76dd04d39cc12844c0994f2c9c5a6f5184c22e9188ec1ff723de41910a21dcad" dependencies = [ "aws-credential-types", "aws-sigv4", @@ -583,9 +583,9 @@ dependencies = [ [[package]] name = "aws-sigv4" -version = "1.2.7" +version = "1.2.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "690118821e46967b3c4501d67d7d52dd75106a9c54cf36cefa1985cedbe94e05" +checksum = "0bc5bbd1e4a2648fd8c5982af03935972c24a2f9846b396de661d351ee3ce837" dependencies = [ "aws-credential-types", "aws-smithy-http", @@ -665,9 +665,9 @@ dependencies = [ [[package]] name = "aws-smithy-runtime" -version = "1.7.7" +version = "1.7.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "865f7050bbc7107a6c98a397a9fcd9413690c27fa718446967cf03b2d3ac517e" +checksum = "d526a12d9ed61fadefda24abe2e682892ba288c2018bcb38b1b4c111d13f6d92" dependencies = [ "aws-smithy-async", "aws-smithy-http", @@ -709,9 +709,9 @@ dependencies = [ [[package]] name = "aws-smithy-types" -version = "1.2.12" +version = "1.2.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a28f6feb647fb5e0d5b50f0472c19a7db9462b74e2fec01bb0b44eedcc834e97" +checksum = "c7b8a53819e42f10d0821f56da995e1470b199686a1809168db6ca485665f042" dependencies = [ "base64-simd", "bytes", @@ -744,9 +744,9 @@ dependencies = [ [[package]] name = "aws-types" -version = "1.3.4" +version = "1.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"b0df5a18c4f951c645300d365fec53a61418bcf4650f604f85fe2a665bfaa0c2" +checksum = "dfbd0a668309ec1f66c0f6bda4840dd6d4796ae26d699ebc266d7cc95c6d040f" dependencies = [ "aws-credential-types", "aws-smithy-async", @@ -863,9 +863,9 @@ dependencies = [ [[package]] name = "brotli-decompressor" -version = "4.0.1" +version = "4.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a45bd2e4095a8b518033b128020dd4a55aab1c0a381ba4404a472630f4bc362" +checksum = "74fa05ad7d803d413eb8380983b092cbbaf9a85f151b871360e7b00cd7060b37" dependencies = [ "alloc-no-stdlib", "alloc-stdlib", @@ -884,9 +884,9 @@ dependencies = [ [[package]] name = "bumpalo" -version = "3.16.0" +version = "3.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" +checksum = "1628fb46dfa0b37568d12e5edd512553eccf6a22a78e8bde00bb4aed84d5bdbf" [[package]] name = "byteorder" @@ -896,9 +896,9 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "bytes" -version = "1.9.0" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "325918d6fe32f23b19878fe4b34794ae41fc19ddbe53b10571a4874d44ffd39b" +checksum = "f61dac84819c6588b558454b194026eb1f09c293b9036ae9b159e74e73ab6cf9" [[package]] name = "bytes-utils" @@ -943,9 +943,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.2.9" +version = "1.2.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8293772165d9345bdaaa39b45b2109591e63fe5e6fbc23c6ff930a048aa310b" +checksum = "755717a7de9ec452bf7f3f1a3099085deabd7f2962b861dae91ecd7a365903d2" dependencies = [ "jobserver", "libc", @@ -985,9 +985,9 @@ dependencies = [ [[package]] name = "chrono-tz" -version = "0.10.0" +version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd6dd8046d00723a59a2f8c5f295c515b9bb9a331ee4f8f3d4dd49e428acd3b6" +checksum = "9c6ac4f2c0bf0f44e9161aec9675e1050aa4a530663c4a9e37e108fa948bca9f" dependencies = [ "chrono", "chrono-tz-build", @@ -1006,9 +1006,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.27" +version = "4.5.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "769b0145982b4b48713e01ec42d61614425f27b7058bda7180a3a41f30104796" +checksum = "3e77c3243bd94243c03672cb5154667347c457ca271254724f9f393aee1c05ff" dependencies = [ "clap_builder", "clap_derive", @@ -1028,9 +1028,9 @@ dependencies = [ [[package]] name = "clap_derive" -version = "4.5.24" +version = "4.5.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "54b755194d6389280185988721fffba69495eed5ee9feeee9a599b53db80318c" +checksum = "bf4ced95c6f4a675af3da73304b9ac4ed991640c36374e4b46795c49e17cf1ed" dependencies = [ "heck", "proc-macro2", @@ -1085,7 +1085,7 @@ version = "0.1.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f9d839f2a20b0aee515dc581a6172f2321f96cab76c1a38a4c584a194955390e" dependencies = [ - "getrandom", + "getrandom 0.2.15", "once_cell", "tiny-keccak", ] @@ -1133,9 +1133,9 @@ dependencies = [ [[package]] name = "cpufeatures" -version = "0.2.16" +version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16b80225097f2e5ae4e7179dd2266824648f3e2f49d9134d584b76389d31c4c3" +checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280" dependencies = [ "libc", ] @@ -1157,9 +1157,9 @@ checksum = 
"d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" [[package]] name = "crunchy" -version = "0.2.2" +version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" +checksum = "43da5946c66ffcc7745f48db692ffbb10a83bfe0afd96235c5c2a4fb23994929" [[package]] name = "crypto-common" @@ -1426,6 +1426,7 @@ dependencies = [ "rand", "regex", "sha2", + "twox-hash", "unicode-segmentation", "uuid", ] @@ -1952,10 +1953,22 @@ dependencies = [ "cfg-if", "js-sys", "libc", - "wasi", + "wasi 0.11.0+wasi-snapshot-preview1", "wasm-bindgen", ] +[[package]] +name = "getrandom" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43a49c392881ce6d5c3b8cb70f98717b7c07aabbdff06687b9030dbfbe2725f8" +dependencies = [ + "cfg-if", + "libc", + "wasi 0.13.3+wasi-0.2.2", + "windows-targets", +] + [[package]] name = "gimli" version = "0.31.1" @@ -2121,9 +2134,9 @@ dependencies = [ [[package]] name = "httparse" -version = "1.9.5" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d71d3574edd2771538b901e6549113b4006ece66150fb69c0fb6d9a2adae946" +checksum = "f2d708df4e7140240a16cd6ab0ab65c972d7433ab77819ea693fde9c43811e2a" [[package]] name = "httpdate" @@ -2163,9 +2176,9 @@ dependencies = [ [[package]] name = "hyper" -version = "1.5.2" +version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "256fb8d4bd6413123cc9d91832d78325c48ff41677595be797d90f42969beae0" +checksum = "cc2b571658e38e0c01b1fdca3bbbe93c00d3d71693ff2770043f8c29bc7d6f80" dependencies = [ "bytes", "futures-channel", @@ -2205,9 +2218,9 @@ checksum = "2d191583f3da1305256f22463b9bb0471acad48a4e534a5218b9963e9c1f59b2" dependencies = [ "futures-util", "http 1.2.0", - "hyper 1.5.2", + "hyper 1.6.0", "hyper-util", - "rustls 0.23.21", + "rustls 0.23.22", "rustls-native-certs 0.8.1", "rustls-pki-types", "tokio", @@ -2226,7 +2239,7 @@ dependencies = [ "futures-util", "http 1.2.0", "http-body 1.0.1", - "hyper 1.5.2", + "hyper 1.6.0", "pin-project-lite", "socket2", "tokio", @@ -2414,9 +2427,9 @@ checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" [[package]] name = "ipnet" -version = "2.10.1" +version = "2.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ddc24109865250148c2e0f3d25d4f0f479571723792d3802153c60922a4fb708" +checksum = "469fb0b9cefa57e3ef31275ee7cacb78f2fdca44e4765491884a2b119d4eb130" [[package]] name = "is_terminal_polyfill" @@ -2688,7 +2701,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2886843bf800fba2e3377cff24abf6379b4c4d5c6681eaf9ea5b0d15090450bd" dependencies = [ "libc", - "wasi", + "wasi 0.11.0+wasi-snapshot-preview1", "windows-sys 0.52.0", ] @@ -2821,7 +2834,7 @@ dependencies = [ "chrono", "futures", "humantime", - "hyper 1.5.2", + "hyper 1.6.0", "itertools 0.13.0", "md-5", "parking_lot", @@ -2848,9 +2861,9 @@ checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775" [[package]] name = "openssl-probe" -version = "0.1.5" +version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" +checksum = "d05e27ee213611ffe7d6348b942e8f942b37114c00cc03cec254295a4a17852e" [[package]] name = "option-ext" @@ -2898,9 +2911,9 @@ dependencies = [ [[package]] name = "parquet" -version = "54.0.0" +version = "54.1.0" 
source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3334c50239d9f4951653d84fa6f636da86f53742e5e5849a30fbe852f3ff4383" +checksum = "8a01a0efa30bbd601ae85b375c728efdb211ade54390281628a7b16708beb235" dependencies = [ "ahash", "arrow-array", @@ -2924,6 +2937,7 @@ dependencies = [ "object_store", "paste", "seq-macro", + "simdutf8", "snap", "thrift", "tokio", @@ -3118,7 +3132,7 @@ dependencies = [ "quinn-proto", "quinn-udp", "rustc-hash", - "rustls 0.23.21", + "rustls 0.23.22", "socket2", "thiserror 2.0.11", "tokio", @@ -3132,11 +3146,11 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a2fe5ef3495d7d2e377ff17b1a8ce2ee2ec2a18cde8b6ad6619d65d0701c135d" dependencies = [ "bytes", - "getrandom", + "getrandom 0.2.15", "rand", "ring", "rustc-hash", - "rustls 0.23.21", + "rustls 0.23.22", "rustls-pki-types", "slab", "thiserror 2.0.11", @@ -3205,7 +3219,7 @@ version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" dependencies = [ - "getrandom", + "getrandom 0.2.15", ] [[package]] @@ -3243,7 +3257,7 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dd6f9d3d47bdd2ad6945c5015a226ec6155d0bcdfd8f7cd29f86b71f8de99d2b" dependencies = [ - "getrandom", + "getrandom 0.2.15", "libredox", "thiserror 2.0.11", ] @@ -3303,7 +3317,7 @@ dependencies = [ "http 1.2.0", "http-body 1.0.1", "http-body-util", - "hyper 1.5.2", + "hyper 1.6.0", "hyper-rustls 0.27.5", "hyper-util", "ipnet", @@ -3314,7 +3328,7 @@ dependencies = [ "percent-encoding", "pin-project-lite", "quinn", - "rustls 0.23.21", + "rustls 0.23.22", "rustls-native-certs 0.8.1", "rustls-pemfile 2.2.0", "rustls-pki-types", @@ -3343,7 +3357,7 @@ checksum = "c17fa4cb658e3583423e915b9f3acc01cceaee1860e33d59ebae66adc3a2dc0d" dependencies = [ "cc", "cfg-if", - "getrandom", + "getrandom 0.2.15", "libc", "spin", "untrusted", @@ -3394,9 +3408,9 @@ checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" [[package]] name = "rustc-hash" -version = "2.1.0" +version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7fb8039b3032c191086b10f11f319a6e99e1e82889c5cc6046f515c9db1d497" +checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d" [[package]] name = "rustc_version" @@ -3409,9 +3423,9 @@ dependencies = [ [[package]] name = "rustix" -version = "0.38.43" +version = "0.38.44" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a78891ee6bf2340288408954ac787aa063d8e8817e9f53abb37c695c6d834ef6" +checksum = "fdb5bc1ae2baa591800df16c9ca78619bf65c0488b41b96ccec5d11220d8c154" dependencies = [ "bitflags 2.8.0", "errno", @@ -3434,9 +3448,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.23.21" +version = "0.23.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f287924602bf649d949c63dc8ac8b235fa5387d394020705b80c4eb597ce5b8" +checksum = "9fb9263ab4eb695e42321db096e3b8fbd715a59b154d5c88d82db2175b681ba7" dependencies = [ "once_cell", "ring", @@ -3490,9 +3504,9 @@ dependencies = [ [[package]] name = "rustls-pki-types" -version = "1.10.1" +version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2bf47e6ff922db3825eb750c4e2ff784c6ff8fb9e13046ef6a1d1c5401b0b37" +checksum = "917ce264624a4b4db1c364dcc35bfca9ded014d0a958cd47ad3e960e988ea51c" dependencies = [ "web-time", ] @@ -3548,9 +3562,9 @@ 
dependencies = [ [[package]] name = "ryu" -version = "1.0.18" +version = "1.0.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f" +checksum = "6ea1a2d0a644769cc99faa24c3ad26b379b786fe7c36fd3c546254801650e6dd" [[package]] name = "same-file" @@ -3624,9 +3638,9 @@ dependencies = [ [[package]] name = "semver" -version = "1.0.24" +version = "1.0.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3cb6eb87a131f756572d7fb904f6e7b68633f09cca868c5df1c4b8d1a694bbba" +checksum = "f79dfe2d285b0488816f30e700a7438c5a73d816b5b7d3ac72fbc48b0d185e03" [[package]] name = "seq-macro" @@ -3665,9 +3679,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.137" +version = "1.0.138" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "930cfb6e6abf99298aaad7d29abbef7a9999a9a8806a40088f55f0dcec03146b" +checksum = "d434192e7da787e94a6ea7e9670b26a036d0ca41e0b7efb2676dd32bae872949" dependencies = [ "itoa", "memchr", @@ -3713,6 +3727,12 @@ dependencies = [ "libc", ] +[[package]] +name = "simdutf8" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3a9fe34e3e7a50316060351f37187a3f546bce95496156754b601a5fa71b76e" + [[package]] name = "siphasher" version = "1.0.1" @@ -3856,9 +3876,9 @@ checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" [[package]] name = "syn" -version = "2.0.96" +version = "2.0.98" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d5d0adab1ae378d7f53bdebc67a39f1f151407ef230f0ce2883572f5d8985c80" +checksum = "36147f1a48ae0ec2b5b3bc5b537d267457555a10dc06f3dbc8cb11ba3006d3b1" dependencies = [ "proc-macro2", "quote", @@ -3887,13 +3907,13 @@ dependencies = [ [[package]] name = "tempfile" -version = "3.15.0" +version = "3.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a8a559c81686f576e8cd0290cd2a24a2a9ad80c98b3478856500fcbd7acd704" +checksum = "38c246215d7d24f48ae091a2902398798e05d978b24315d6efbc00ede9a8bb91" dependencies = [ "cfg-if", "fastrand", - "getrandom", + "getrandom 0.3.1", "once_cell", "rustix", "windows-sys 0.59.0", @@ -4065,7 +4085,7 @@ version = "0.26.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5f6d0975eaace0cf0fcadee4e4aaa5da15b5c079146f2cffb67c113be122bf37" dependencies = [ - "rustls 0.23.21", + "rustls 0.23.22", "tokio", ] @@ -4090,9 +4110,9 @@ checksum = "0dd7358ecb8fc2f8d014bf86f6f638ce72ba252a2c3a2572f2a795f1d23efb41" [[package]] name = "toml_edit" -version = "0.22.22" +version = "0.22.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ae48d6208a266e853d946088ed816055e556cc6028c5e8e2b84d9fa5dd7c7f5" +checksum = "02a8b472d1a3d7c18e2d61a489aee3453fd9031c33e4f55bd533f4a7adca1bee" dependencies = [ "indexmap", "toml_datetime", @@ -4170,6 +4190,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "97fee6b57c6a41524a810daee9286c02d7752c4253064d0b05472833a438f675" dependencies = [ "cfg-if", + "rand", "static_assertions", ] @@ -4201,9 +4222,9 @@ checksum = "42ff0bf0c66b8238c6f3b578df37d0b7848e55df8577b3f74f92a69acceeb825" [[package]] name = "unicode-ident" -version = "1.0.14" +version = "1.0.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83" +checksum = 
"a210d160f08b701c8721ba1c726c11662f877ea6b7094007e1ca9a1041945034" [[package]] name = "unicode-segmentation" @@ -4266,11 +4287,11 @@ checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" [[package]] name = "uuid" -version = "1.12.0" +version = "1.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "744018581f9a3454a9e15beb8a33b017183f1e7c0cd170232a2d1453b23a51c4" +checksum = "ced87ca4be083373936a67f8de945faa23b6b42384bd5b64434850802c6dccd0" dependencies = [ - "getrandom", + "getrandom 0.3.1", "serde", ] @@ -4288,9 +4309,9 @@ checksum = "5c3082ca00d5a5ef149bb8b555a72ae84c9c59f7250f013ac822ac2e49b19c64" [[package]] name = "wait-timeout" -version = "0.2.0" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9f200f5b12eb75f8c1ed65abd4b2db8a6e1b138a20de009dacee265a2498f3f6" +checksum = "09ac3b126d3914f9849036f826e054cbabdc8519970b8998ddaf3b5bd3c65f11" dependencies = [ "libc", ] @@ -4320,6 +4341,15 @@ version = "0.11.0+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" +[[package]] +name = "wasi" +version = "0.13.3+wasi-0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26816d2e1a4a36a2940b96c5296ce403917633dff8f3440e9b236ed6f6bacad2" +dependencies = [ + "wit-bindgen-rt", +] + [[package]] name = "wasm-bindgen" version = "0.2.100" @@ -4556,13 +4586,22 @@ checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" [[package]] name = "winnow" -version = "0.6.24" +version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8d71a593cc5c42ad7876e2c1fda56f314f3754c084128833e64f1345ff8a03a" +checksum = "86e376c75f4f43f44db463cf729e0d3acbf954d13e22c51e26e4c264b4ab545f" dependencies = [ "memchr", ] +[[package]] +name = "wit-bindgen-rt" +version = "0.33.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3268f3d866458b787f390cf61f4bbb563b922d091359f9608842999eaee3943c" +dependencies = [ + "bitflags 2.8.0", +] + [[package]] name = "write16" version = "1.0.0" From ed2324fe42b9b55c8efe27ec07a65a8be46f4bd7 Mon Sep 17 00:00:00 2001 From: Spaarsh-root Date: Fri, 7 Feb 2025 21:44:05 +0530 Subject: [PATCH 13/14] Implemented feedback items --- datafusion/functions/src/hash/xxhash.rs | 120 +++++++++++--------- datafusion/sqllogictest/test_files/hash.slt | 16 +++ 2 files changed, 80 insertions(+), 56 deletions(-) diff --git a/datafusion/functions/src/hash/xxhash.rs b/datafusion/functions/src/hash/xxhash.rs index 1239ca89246f..74bb517a5e0c 100644 --- a/datafusion/functions/src/hash/xxhash.rs +++ b/datafusion/functions/src/hash/xxhash.rs @@ -15,11 +15,10 @@ // specific language governing permissions and limitations // under the License. 
-use arrow::array::{Array, BinaryArray, LargeBinaryArray, LargeStringArray, StringArray}; +use arrow::array::{Array, AsArray, BinaryArray, LargeBinaryArray, LargeStringArray, StringArray, StringBuilder}; use arrow::datatypes::DataType; use arrow::datatypes::DataType::{Binary, Int64, LargeBinary, LargeUtf8, Utf8, Utf8View}; -use datafusion_common::DataFusionError; -use datafusion_common::{plan_err, Result, ScalarValue}; +use datafusion_common::{plan_err,exec_err, internal_err, Result, ScalarValue}; use datafusion_expr::{ ColumnarValue, Documentation, ScalarUDFImpl, Signature, TypeSignature, Volatility, }; @@ -32,7 +31,7 @@ use twox_hash::{XxHash32, XxHash64}; #[user_doc( doc_section(label = "Hashing Functions"), description = "Computes the XXHash32 hash of a binary string.", - syntax_example = "xxhash32(expression)", + syntax_example = "xxhash32(expression [,seed])", sql_example = r#"```sql > select xxhash32('foo'); +-------------------------------------------+ @@ -106,14 +105,14 @@ impl ScalarUDFImpl for XxHash32Func { Null => Null, _ => { return plan_err!( - "the xxhash32 can only accept strings but got {:?}", + "The xxhash32 can only accept Utf8, Utf8View, LargeUtf8, Binary and LargeBinary but got {:?}", **t ); } }, other => { return plan_err!( - "The xxhash32 function can only accept strings. Got {other}" + "The xxhash32 can only accept Utf8, Utf8View, LargeUtf8, Binary and LargeBinary but got {other}" ); } }) @@ -131,17 +130,17 @@ impl ScalarUDFImpl for XxHash32Func { if *seed >= 0 && *seed <= u32::MAX as i64 { *seed as u32 } else { - return Err(DataFusionError::Execution(format!( + return exec_err!( "Seed value out of range for UInt32: {}", seed - ))); + ); } } else { let actual_type = format!("{:?}", &args[1]); - return Err(DataFusionError::Execution(format!( + return exec_err!( "Expected a Int64 seed value, but got {}", actual_type - ))); + ); } } else { 0 // Default seed value @@ -180,11 +179,7 @@ impl ScalarUDFImpl for XxHash32Func { Arc::new(hash_array) as Arc } _ => { - let actual_type = format!("{:?}", scalar); - return Err(DataFusionError::Internal(format!( - "Unsupported scalar type: {}", - actual_type - ))); + return internal_err!("Unsupported scalar type: {:?}", scalar); } }, }; @@ -200,7 +195,7 @@ impl ScalarUDFImpl for XxHash32Func { #[user_doc( doc_section(label = "Hashing Functions"), description = "Computes the XXHash64 hash of a binary string.", - syntax_example = "xxhash64(expression)", + syntax_example = "xxhash64(expression [,seed])", sql_example = r#"```sql > select xxhash64('foo'); +-------------------------------------------+ @@ -274,14 +269,14 @@ impl ScalarUDFImpl for XxHash64Func { Null => Null, _ => { return plan_err!( - "the xxhash64 can only accept strings but got {:?}", + "The xxhash64 can only accept Utf8, Utf8View, LargeUtf8, Binary and LargeBinary but got {:?}", **t ); } }, other => { return plan_err!( - "The xxhash64 function can only accept strings. 
Got {other}" + "The xxhash64 can only accept Utf8, Utf8View, LargeUtf8, Binary and LargeBinary but {other}" ); } }) @@ -299,17 +294,17 @@ impl ScalarUDFImpl for XxHash64Func { if *seed >= 0 { *seed as u64 } else { - return Err(DataFusionError::Execution(format!( + return exec_err!( "Seed value out of range for UInt64: {}", seed - ))); + ); } } else { let actual_type = format!("{:?}", &args[1]); - return Err(DataFusionError::Execution(format!( + return exec_err!( "Expected a Int64 seed value, but got {}", actual_type - ))); + ); } } else { 0 // Default seed value @@ -349,10 +344,10 @@ impl ScalarUDFImpl for XxHash64Func { } _ => { let actual_type = format!("{:?}", scalar); - return Err(DataFusionError::Internal(format!( + return exec_err!( "Unsupported scalar type: {}", actual_type - ))); + ); } }, }; @@ -377,7 +372,7 @@ fn hash_value( value_bytes: &[u8], mut hasher: T, hash_type: HashType, -) -> Result { +) -> Result { hasher.write(value_bytes); let hash = hasher.finish(); match hash_type { @@ -396,35 +391,51 @@ fn process_array( array: &dyn Array, mut hasher: T, hash_type: HashType, -) -> Result> { - let mut hash_results: Vec = Vec::with_capacity(array.len()); +) -> Result { + let mut hash_results = StringBuilder::new(); match array.data_type() { - Utf8 | Utf8View | LargeUtf8 => { - let string_array: &dyn Array = - if array.data_type() == &Utf8 || array.data_type() == &Utf8View { - array.as_any().downcast_ref::().unwrap() - } else { - array.as_any().downcast_ref::().unwrap() - }; + Utf8View => { + let string_view_array = array.as_string_view(); for i in 0..array.len() { if array.is_null(i) { - hash_results.push(String::new()); // Handle null values + hash_results.append_value(String::new()); continue; } - let value = if let Some(string_array) = - string_array.as_any().downcast_ref::() - { - string_array.value(i) - } else { - string_array - .as_any() - .downcast_ref::() - .unwrap() - .value(i) - }; + let value = string_view_array.value(i); + hash_results.append_value(hash_value( + value.as_bytes(), + &mut hasher, + hash_type.clone(), + )?); + } + } + + Utf8 => { + let string_array = array.as_any().downcast_ref::().unwrap(); + for i in 0..array.len() { + if array.is_null(i) { + hash_results.append_value(String::new()); + continue; + } + let value = string_array.value(i); + hash_results.append_value(hash_value( + value.as_bytes(), + &mut hasher, + hash_type.clone(), + )?); + } + } - hash_results.push(hash_value( + LargeUtf8 => { + let large_string_array = array.as_any().downcast_ref::().unwrap(); + for i in 0..array.len() { + if array.is_null(i) { + hash_results.append_value(String::new()); + continue; + } + let value = large_string_array.value(i); + hash_results.append_value(hash_value( value.as_bytes(), &mut hasher, hash_type.clone(), @@ -440,7 +451,7 @@ fn process_array( }; for i in 0..array.len() { if array.is_null(i) { - hash_results.push(String::new()); // Handle null values + hash_results.append_value(String::new()); continue; } let value = if let Some(binary_array) = @@ -454,23 +465,20 @@ fn process_array( .unwrap() .value(i) }; - hash_results.push(hash_value(value, &mut hasher, hash_type.clone())?); + hash_results.append_value(hash_value(value, &mut hasher, hash_type.clone())?); } } DataType::Null => { for _ in 0..array.len() { - hash_results.push(String::new()); // Handle null values + hash_results.append_value(String::new()); } } _ => { let actual_type = format!("{:?}", array.data_type()); - return Err(DataFusionError::Internal(format!( - "Unsupported array type: {}", - 
actual_type - ))); + return exec_err!("Unsupported array type: {}", actual_type); } } - Ok(hash_results) -} + Ok(hash_results.finish()) +} \ No newline at end of file diff --git a/datafusion/sqllogictest/test_files/hash.slt b/datafusion/sqllogictest/test_files/hash.slt index 1b4134ab6893..bc9048565045 100644 --- a/datafusion/sqllogictest/test_files/hash.slt +++ b/datafusion/sqllogictest/test_files/hash.slt @@ -14,6 +14,14 @@ b6855437 01543429 b30d56b4 +# Test xxhash32 with Utf8View array input +query T +WITH input_data AS (SELECT arrow_cast(column1, 'Utf8View') as utf8view_value FROM (VALUES ('foobar1'),('foobar2'),('foobar3')) AS t(column1)) SELECT xxhash32(utf8view_value) as hash_value FROM input_data; +---- +2b0d1874 +9925f907 +df748f36 + query T SELECT xxhash32(NULL) AS hash_value; ---- @@ -72,6 +80,14 @@ b7b41276360564d4 d8316e61d84f6ba4 c6f2d2dd0ad64fb6 +# Test xxhash64 with Utf8View array input +query T +WITH input_data AS (SELECT arrow_cast(column1, 'Utf8View') as utf8view_value FROM (VALUES ('foobar1'),('foobar2'),('foobar3')) AS t(column1)) SELECT xxhash64(utf8view_value) as hash_value FROM input_data; +---- +36425528f43b829c +b24f52e2956da1a9 +5e75bd6e3aac89a9 + query T SELECT xxhash64(NULL) AS hash_value; ---- From 18b193704329a85e52a86087023acbaf61bf7776 Mon Sep 17 00:00:00 2001 From: Spaarsh-root Date: Fri, 7 Feb 2025 23:00:11 +0530 Subject: [PATCH 14/14] Fixed fmt and .md test fails --- datafusion/functions/src/hash/xxhash.rs | 45 ++++++++----------- .../source/user-guide/sql/scalar_functions.md | 4 +- 2 files changed, 21 insertions(+), 28 deletions(-) diff --git a/datafusion/functions/src/hash/xxhash.rs b/datafusion/functions/src/hash/xxhash.rs index 74bb517a5e0c..c01646b62db8 100644 --- a/datafusion/functions/src/hash/xxhash.rs +++ b/datafusion/functions/src/hash/xxhash.rs @@ -15,10 +15,13 @@ // specific language governing permissions and limitations // under the License. 
-use arrow::array::{Array, AsArray, BinaryArray, LargeBinaryArray, LargeStringArray, StringArray, StringBuilder}; +use arrow::array::{ + Array, AsArray, BinaryArray, LargeBinaryArray, LargeStringArray, StringArray, + StringBuilder, +}; use arrow::datatypes::DataType; use arrow::datatypes::DataType::{Binary, Int64, LargeBinary, LargeUtf8, Utf8, Utf8View}; -use datafusion_common::{plan_err,exec_err, internal_err, Result, ScalarValue}; +use datafusion_common::{exec_err, internal_err, plan_err, Result, ScalarValue}; use datafusion_expr::{ ColumnarValue, Documentation, ScalarUDFImpl, Signature, TypeSignature, Volatility, }; @@ -130,17 +133,11 @@ impl ScalarUDFImpl for XxHash32Func { if *seed >= 0 && *seed <= u32::MAX as i64 { *seed as u32 } else { - return exec_err!( - "Seed value out of range for UInt32: {}", - seed - ); + return exec_err!("Seed value out of range for UInt32: {}", seed); } } else { let actual_type = format!("{:?}", &args[1]); - return exec_err!( - "Expected a Int64 seed value, but got {}", - actual_type - ); + return exec_err!("Expected a Int64 seed value, but got {}", actual_type); } } else { 0 // Default seed value @@ -294,17 +291,11 @@ impl ScalarUDFImpl for XxHash64Func { if *seed >= 0 { *seed as u64 } else { - return exec_err!( - "Seed value out of range for UInt64: {}", - seed - ); + return exec_err!("Seed value out of range for UInt64: {}", seed); } } else { let actual_type = format!("{:?}", &args[1]); - return exec_err!( - "Expected a Int64 seed value, but got {}", - actual_type - ); + return exec_err!("Expected a Int64 seed value, but got {}", actual_type); } } else { 0 // Default seed value @@ -344,10 +335,7 @@ impl ScalarUDFImpl for XxHash64Func { } _ => { let actual_type = format!("{:?}", scalar); - return exec_err!( - "Unsupported scalar type: {}", - actual_type - ); + return exec_err!("Unsupported scalar type: {}", actual_type); } }, }; @@ -410,7 +398,7 @@ fn process_array( )?); } } - + Utf8 => { let string_array = array.as_any().downcast_ref::().unwrap(); for i in 0..array.len() { @@ -428,7 +416,8 @@ fn process_array( } LargeUtf8 => { - let large_string_array = array.as_any().downcast_ref::().unwrap(); + let large_string_array = + array.as_any().downcast_ref::().unwrap(); for i in 0..array.len() { if array.is_null(i) { hash_results.append_value(String::new()); @@ -465,7 +454,11 @@ fn process_array( .unwrap() .value(i) }; - hash_results.append_value(hash_value(value, &mut hasher, hash_type.clone())?); + hash_results.append_value(hash_value( + value, + &mut hasher, + hash_type.clone(), + )?); } } @@ -481,4 +474,4 @@ fn process_array( } Ok(hash_results.finish()) -} \ No newline at end of file +} diff --git a/docs/source/user-guide/sql/scalar_functions.md b/docs/source/user-guide/sql/scalar_functions.md index 657b5f3acc65..874a59f91b00 100644 --- a/docs/source/user-guide/sql/scalar_functions.md +++ b/docs/source/user-guide/sql/scalar_functions.md @@ -4340,7 +4340,7 @@ sha512(expression) Computes the XXHash32 hash of a binary string. ``` -xxhash32(expression) +xxhash32(expression [,seed]) ``` #### Arguments @@ -4363,7 +4363,7 @@ xxhash32(expression) Computes the XXHash64 hash of a binary string. ``` -xxhash64(expression) +xxhash64(expression [,seed]) ``` #### Arguments
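For quick reference, a minimal SQL usage sketch of the two functions added by this series. This is only a summary of behaviour already exercised above: the expected digests are copied from the `hash.slt` test file in patch 11/14, the optional second argument is the seed added in the later revisions, and the empty-string result for `NULL` input reflects the null handling in the final version of `invoke_batch`/`process_array`.

```sql
-- xxhash32 / xxhash64 over string input, with and without an explicit seed
SELECT xxhash32('foo');            -- e20f0dd9
SELECT xxhash32('foo', 1);         -- 1742761f
SELECT xxhash64('foo');            -- 33bf00a859c4ba3f
SELECT xxhash64('foo', 1);         -- c34823c5bf4f2cbd

-- binary input is accepted as well
SELECT xxhash32('foo'::BYTEA, 1);  -- 1742761f

-- NULL input currently yields an empty string rather than NULL
SELECT xxhash64(NULL);             -- (empty)
```

Both functions return the digest as a hex-encoded string (via `hex::encode` in `hash_value`), and seeds outside the accepted range (negative values, or values above `u32::MAX` for `xxhash32`) raise an execution error rather than wrapping.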