From 33f3e9c13550d6d390f6128b8d94836938f652b3 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 10 Jul 2024 13:32:40 -0600 Subject: [PATCH 01/68] feat: Create new `datafusion-comet-spark-expr` crate containing Spark-compatible DataFusion expressions (#638) * convert into workspace project * update GitHub actions * update Makefile * fix regression * update target path * update protobuf path in pom.xml * update more paths * add new datafusion-comet-expr crate * rename CometAbsFunc to Abs and add documentation * fix error message * improve error handling * update crate description * remove unused dep * address feedback * finish renaming crate * update README for datafusion-spark-expr * rename crate to datafusion-comet-spark-expr --- Cargo.toml | 38 +++++++++++++++++++++++ README.md | 23 ++++++++++++++ src/abs.rs | 88 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/lib.rs | 56 ++++++++++++++++++++++++++++++++++ 4 files changed, 205 insertions(+) create mode 100644 Cargo.toml create mode 100644 README.md create mode 100644 src/abs.rs create mode 100644 src/lib.rs diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 000000000000..d10d04944b76 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,38 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +[package] +name = "datafusion-comet-spark-expr" +description = "DataFusion expressions that emulate Apache Spark's behavior" +version = { workspace = true } +homepage = { workspace = true } +repository = { workspace = true } +authors = { workspace = true } +readme = { workspace = true } +license = { workspace = true } +edition = { workspace = true } + +[dependencies] +arrow = { workspace = true } +arrow-schema = { workspace = true } +datafusion = { workspace = true } +datafusion-common = { workspace = true } +datafusion-functions = { workspace = true } + +[lib] +name = "datafusion_comet_spark_expr" +path = "src/lib.rs" diff --git a/README.md b/README.md new file mode 100644 index 000000000000..a7ee7536328e --- /dev/null +++ b/README.md @@ -0,0 +1,23 @@ + + +# datafusion-comet-spark-expr: Spark-compatible Expressions + +This crate provides Apache Spark-compatible expressions for use with DataFusion and is maintained as part of the +[Apache DataFusion Comet](https://github.com/apache/datafusion-comet/) subproject. \ No newline at end of file diff --git a/src/abs.rs b/src/abs.rs new file mode 100644 index 000000000000..198a96e571f3 --- /dev/null +++ b/src/abs.rs @@ -0,0 +1,88 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Spark-compatible implementation of abs function
+
+use std::{any::Any, sync::Arc};
+
+use arrow::datatypes::DataType;
+use arrow_schema::ArrowError;
+
+use datafusion::logical_expr::{ColumnarValue, ScalarUDFImpl, Signature};
+use datafusion_common::DataFusionError;
+use datafusion_functions::math;
+
+use super::{EvalMode, SparkError};
+
+/// Spark-compatible ABS expression
+#[derive(Debug)]
+pub struct Abs {
+    inner_abs_func: Arc<dyn ScalarUDFImpl>,
+    eval_mode: EvalMode,
+    data_type_name: String,
+}
+
+impl Abs {
+    pub fn new(eval_mode: EvalMode, data_type_name: String) -> Result<Self, DataFusionError> {
+        if let EvalMode::Legacy | EvalMode::Ansi = eval_mode {
+            Ok(Self {
+                inner_abs_func: math::abs().inner().clone(),
+                eval_mode,
+                data_type_name,
+            })
+        } else {
+            Err(DataFusionError::Execution(format!(
+                "Invalid EvalMode: \"{:?}\"",
+                eval_mode
+            )))
+        }
+    }
+}
+
+impl ScalarUDFImpl for Abs {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+    fn name(&self) -> &str {
+        "abs"
+    }
+
+    fn signature(&self) -> &Signature {
+        self.inner_abs_func.signature()
+    }
+
+    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType, DataFusionError> {
+        self.inner_abs_func.return_type(arg_types)
+    }
+
+    fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue, DataFusionError> {
+        match self.inner_abs_func.invoke(args) {
+            Err(DataFusionError::ArrowError(ArrowError::ComputeError(msg), _))
+                if msg.contains("overflow") =>
+            {
+                if self.eval_mode == EvalMode::Legacy {
+                    Ok(args[0].clone())
+                } else {
+                    Err(DataFusionError::External(Box::new(
+                        SparkError::ArithmeticOverflow(self.data_type_name.clone()),
+                    )))
+                }
+            }
+            other => other,
+        }
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 000000000000..3873754be5b0
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,56 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::error::Error;
+use std::fmt::{Display, Formatter};
+
+pub mod abs;
+
+/// Spark supports three evaluation modes when evaluating expressions, which affect
+/// the behavior when processing input values that are invalid or would result in an
+/// error, such as divide by zero errors, and also affect behavior when converting
+/// between types.
+#[derive(Debug, Hash, PartialEq, Clone, Copy)] +pub enum EvalMode { + /// Legacy is the default behavior in Spark prior to Spark 4.0. This mode silently ignores + /// or replaces errors during SQL operations. Operations resulting in errors (like + /// division by zero) will produce NULL values instead of failing. Legacy mode also + /// enables implicit type conversions. + Legacy, + /// Adheres to the ANSI SQL standard for error handling by throwing exceptions for + /// operations that result in errors. Does not perform implicit type conversions. + Ansi, + /// Same as Ansi mode, except that it converts errors to NULL values without + /// failing the entire query. + Try, +} + +#[derive(Debug)] +pub enum SparkError { + ArithmeticOverflow(String), +} + +impl Error for SparkError {} + +impl Display for SparkError { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + Self::ArithmeticOverflow(data_type) => + write!(f, "[ARITHMETIC_OVERFLOW] {} overflow. If necessary set \"spark.sql.ansi.enabled\" to \"false\" to bypass this error.", data_type) + } + } +} From 96a2f41dd4b6eca1cb0d6bab977fde596c9e22c9 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 11 Jul 2024 05:39:58 -0600 Subject: [PATCH 02/68] feat: Move `IfExpr` to `spark-expr` crate (#653) --- Cargo.toml | 2 + src/if_expr.rs | 231 +++++++++++++++++++++++++++++++++++++++++++++++++ src/lib.rs | 6 +- 3 files changed, 238 insertions(+), 1 deletion(-) create mode 100644 src/if_expr.rs diff --git a/Cargo.toml b/Cargo.toml index d10d04944b76..8bf76dff6e25 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -32,6 +32,8 @@ arrow-schema = { workspace = true } datafusion = { workspace = true } datafusion-common = { workspace = true } datafusion-functions = { workspace = true } +datafusion-physical-expr = { workspace = true } +datafusion-comet-utils = { workspace = true } [lib] name = "datafusion_comet_spark_expr" diff --git a/src/if_expr.rs b/src/if_expr.rs new file mode 100644 index 000000000000..c04494ec4ffb --- /dev/null +++ b/src/if_expr.rs @@ -0,0 +1,231 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+
+use std::{
+    any::Any,
+    hash::{Hash, Hasher},
+    sync::Arc,
+};
+
+use arrow::{
+    array::*,
+    compute::{and, is_null, kernels::zip::zip, not, or_kleene},
+    datatypes::{DataType, Schema},
+    record_batch::RecordBatch,
+};
+use datafusion::logical_expr::ColumnarValue;
+use datafusion_common::{cast::as_boolean_array, Result};
+use datafusion_physical_expr::PhysicalExpr;
+
+use datafusion_comet_utils::down_cast_any_ref;
+
+#[derive(Debug, Hash)]
+pub struct IfExpr {
+    if_expr: Arc<dyn PhysicalExpr>,
+    true_expr: Arc<dyn PhysicalExpr>,
+    false_expr: Arc<dyn PhysicalExpr>,
+}
+
+impl std::fmt::Display for IfExpr {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        write!(
+            f,
+            "If [if: {}, true_expr: {}, false_expr: {}]",
+            self.if_expr, self.true_expr, self.false_expr
+        )
+    }
+}
+
+impl IfExpr {
+    /// Create a new IF expression
+    pub fn new(
+        if_expr: Arc<dyn PhysicalExpr>,
+        true_expr: Arc<dyn PhysicalExpr>,
+        false_expr: Arc<dyn PhysicalExpr>,
+    ) -> Self {
+        Self {
+            if_expr,
+            true_expr,
+            false_expr,
+        }
+    }
+}
+
+impl PhysicalExpr for IfExpr {
+    /// Return a reference to Any that can be used for down-casting
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn data_type(&self, input_schema: &Schema) -> Result<DataType> {
+        let data_type = self.true_expr.data_type(input_schema)?;
+        Ok(data_type)
+    }
+
+    fn nullable(&self, _input_schema: &Schema) -> Result<bool> {
+        if self.true_expr.nullable(_input_schema)? || self.false_expr.nullable(_input_schema)? {
+            Ok(true)
+        } else {
+            Ok(false)
+        }
+    }
+
+    fn evaluate(&self, batch: &RecordBatch) -> Result<ColumnarValue> {
+        let mut remainder = BooleanArray::from(vec![true; batch.num_rows()]);
+
+        // evaluate if condition on batch
+        let if_value = self.if_expr.evaluate_selection(batch, &remainder)?;
+        let if_value = if_value.into_array(batch.num_rows())?;
+        let if_value =
+            as_boolean_array(&if_value).expect("if expression did not return a BooleanArray");
+
+        let true_value = self.true_expr.evaluate_selection(batch, if_value)?;
+        let true_value = true_value.into_array(batch.num_rows())?;
+
+        remainder = and(
+            &remainder,
+            &or_kleene(&not(if_value)?, &is_null(if_value)?)?,
+        )?;
+
+        let false_value = self
+            .false_expr
+            .evaluate_selection(batch, &remainder)?
+            .into_array(batch.num_rows())?;
+        let current_value = zip(&remainder, &false_value, &true_value)?;
+
+        Ok(ColumnarValue::Array(current_value))
+    }
+
+    fn children(&self) -> Vec<&Arc<dyn PhysicalExpr>> {
+        vec![&self.if_expr, &self.true_expr, &self.false_expr]
+    }
+
+    fn with_new_children(
+        self: Arc<Self>,
+        children: Vec<Arc<dyn PhysicalExpr>>,
+    ) -> Result<Arc<dyn PhysicalExpr>> {
+        Ok(Arc::new(IfExpr::new(
+            children[0].clone(),
+            children[1].clone(),
+            children[2].clone(),
+        )))
+    }
+
+    fn dyn_hash(&self, state: &mut dyn Hasher) {
+        let mut s = state;
+        self.if_expr.hash(&mut s);
+        self.true_expr.hash(&mut s);
+        self.false_expr.hash(&mut s);
+        self.hash(&mut s);
+    }
+}
+
+impl PartialEq<dyn Any> for IfExpr {
+    fn eq(&self, other: &dyn Any) -> bool {
+        down_cast_any_ref(other)
+            .downcast_ref::<Self>()
+            .map(|x| {
+                self.if_expr.eq(&x.if_expr)
+                    && self.true_expr.eq(&x.true_expr)
+                    && self.false_expr.eq(&x.false_expr)
+            })
+            .unwrap_or(false)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use arrow::{array::StringArray, datatypes::*};
+    use datafusion::logical_expr::Operator;
+    use datafusion_common::cast::as_int32_array;
+    use datafusion_physical_expr::expressions::{binary, col, lit};
+
+    use super::*;
+
+    /// Create an If expression
+    fn if_fn(
+        if_expr: Arc<dyn PhysicalExpr>,
+        true_expr: Arc<dyn PhysicalExpr>,
+        false_expr: Arc<dyn PhysicalExpr>,
+    ) -> Result<Arc<dyn PhysicalExpr>> {
+        Ok(Arc::new(IfExpr::new(if_expr, true_expr, false_expr)))
+    }
+
+    #[test]
+    fn test_if_1() -> Result<()> {
+        let schema = Schema::new(vec![Field::new("a", DataType::Utf8, true)]);
+        let a = StringArray::from(vec![Some("foo"), Some("baz"), None, Some("bar")]);
+        let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a)])?;
+        let schema_ref = batch.schema();
+
+        // if a = 'foo' 123 else 999
+        let if_expr = binary(
+            col("a", &schema_ref)?,
+            Operator::Eq,
+            lit("foo"),
+            &schema_ref,
+        )?;
+        let true_expr = lit(123i32);
+        let false_expr = lit(999i32);
+
+        let expr = if_fn(if_expr, true_expr, false_expr);
+        let result = expr?.evaluate(&batch)?.into_array(batch.num_rows())?;
+        let result = as_int32_array(&result)?;
+
+        let expected = &Int32Array::from(vec![Some(123), Some(999), Some(999), Some(999)]);
+
+        assert_eq!(expected, result);
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_if_2() -> Result<()> {
+        let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]);
+        let a = Int32Array::from(vec![Some(1), Some(0), None, Some(5)]);
+        let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a)])?;
+        let schema_ref = batch.schema();
+
+        // if a >= 1 123 else 999
+        let if_expr = binary(col("a", &schema_ref)?, Operator::GtEq, lit(1), &schema_ref)?;
+        let true_expr = lit(123i32);
+        let false_expr = lit(999i32);
+
+        let expr = if_fn(if_expr, true_expr, false_expr);
+        let result = expr?.evaluate(&batch)?.into_array(batch.num_rows())?;
+        let result = as_int32_array(&result)?;
+
+        let expected = &Int32Array::from(vec![Some(123), Some(999), Some(999), Some(123)]);
+        assert_eq!(expected, result);
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_if_children() {
+        let if_expr = lit(true);
+        let true_expr = lit(123i32);
+        let false_expr = lit(999i32);
+
+        let expr = if_fn(if_expr, true_expr, false_expr).unwrap();
+        let children = expr.children();
+        assert_eq!(children.len(), 3);
+        assert_eq!(children[0].to_string(), "true");
+        assert_eq!(children[1].to_string(), "123");
+        assert_eq!(children[2].to_string(), "999");
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
index 3873754be5b0..c36e8855edf8 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -18,7 +18,11 @@
 use std::error::Error;
 use std::fmt::{Display, Formatter};
 
-pub mod abs;
+mod abs;
+mod if_expr;
+
+pub use abs::Abs;
+pub
use if_expr::IfExpr; /// Spark supports three evaluation modes when evaluating expressions, which affect /// the behavior when processing input values that are invalid or would result in an From 2f22a4dff765bf6e5e77ebed4c2cdc7baf02276a Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 12 Jul 2024 05:52:16 -0600 Subject: [PATCH 03/68] chore: Refactoring of CometError/SparkError (#655) --- Cargo.toml | 1 + src/abs.rs | 7 ++--- src/error.rs | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++++ src/lib.rs | 21 ++------------- 4 files changed, 80 insertions(+), 22 deletions(-) create mode 100644 src/error.rs diff --git a/Cargo.toml b/Cargo.toml index 8bf76dff6e25..4a9b94087321 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -34,6 +34,7 @@ datafusion-common = { workspace = true } datafusion-functions = { workspace = true } datafusion-physical-expr = { workspace = true } datafusion-comet-utils = { workspace = true } +thiserror = { workspace = true } [lib] name = "datafusion_comet_spark_expr" diff --git a/src/abs.rs b/src/abs.rs index 198a96e571f3..fa25a7775ae7 100644 --- a/src/abs.rs +++ b/src/abs.rs @@ -77,9 +77,10 @@ impl ScalarUDFImpl for Abs { if self.eval_mode == EvalMode::Legacy { Ok(args[0].clone()) } else { - Err(DataFusionError::External(Box::new( - SparkError::ArithmeticOverflow(self.data_type_name.clone()), - ))) + Err(SparkError::ArithmeticOverflow { + from_type: self.data_type_name.clone(), + } + .into()) } } other => other, diff --git a/src/error.rs b/src/error.rs new file mode 100644 index 000000000000..728a35a9d2e0 --- /dev/null +++ b/src/error.rs @@ -0,0 +1,73 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow_schema::ArrowError; +use datafusion_common::DataFusionError; + +#[derive(thiserror::Error, Debug)] +pub enum SparkError { + // Note that this message format is based on Spark 3.4 and is more detailed than the message + // returned by Spark 3.3 + #[error("[CAST_INVALID_INPUT] The value '{value}' of the type \"{from_type}\" cannot be cast to \"{to_type}\" \ + because it is malformed. Correct the value as per the syntax, or change its target type. \ + Use `try_cast` to tolerate malformed input and return NULL instead. If necessary \ + set \"spark.sql.ansi.enabled\" to \"false\" to bypass this error.")] + CastInvalidValue { + value: String, + from_type: String, + to_type: String, + }, + + #[error("[NUMERIC_VALUE_OUT_OF_RANGE] {value} cannot be represented as Decimal({precision}, {scale}). If necessary set \"spark.sql.ansi.enabled\" to \"false\" to bypass this error, and return NULL instead.")] + NumericValueOutOfRange { + value: String, + precision: u8, + scale: i8, + }, + + #[error("[CAST_OVERFLOW] The value {value} of the type \"{from_type}\" cannot be cast to \"{to_type}\" \ + due to an overflow. 
Use `try_cast` to tolerate overflow and return NULL instead. If necessary \ + set \"spark.sql.ansi.enabled\" to \"false\" to bypass this error.")] + CastOverFlow { + value: String, + from_type: String, + to_type: String, + }, + + #[error("[ARITHMETIC_OVERFLOW] {from_type} overflow. If necessary set \"spark.sql.ansi.enabled\" to \"false\" to bypass this error.")] + ArithmeticOverflow { from_type: String }, + + #[error("ArrowError: {0}.")] + Arrow(ArrowError), + + #[error("InternalError: {0}.")] + Internal(String), +} + +pub type SparkResult = Result; + +impl From for SparkError { + fn from(value: ArrowError) -> Self { + SparkError::Arrow(value) + } +} + +impl From for DataFusionError { + fn from(value: SparkError) -> Self { + DataFusionError::External(Box::new(value)) + } +} diff --git a/src/lib.rs b/src/lib.rs index c36e8855edf8..57da56f9aca6 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -15,13 +15,12 @@ // specific language governing permissions and limitations // under the License. -use std::error::Error; -use std::fmt::{Display, Formatter}; - mod abs; +mod error; mod if_expr; pub use abs::Abs; +pub use error::{SparkError, SparkResult}; pub use if_expr::IfExpr; /// Spark supports three evaluation modes when evaluating expressions, which affect @@ -42,19 +41,3 @@ pub enum EvalMode { /// failing the entire query. Try, } - -#[derive(Debug)] -pub enum SparkError { - ArithmeticOverflow(String), -} - -impl Error for SparkError {} - -impl Display for SparkError { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - match self { - Self::ArithmeticOverflow(data_type) => - write!(f, "[ARITHMETIC_OVERFLOW] {} overflow. If necessary set \"spark.sql.ansi.enabled\" to \"false\" to bypass this error.", data_type) - } - } -} From 11138bb47ee9f15523c8da457d5966bb468653fc Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 12 Jul 2024 13:21:50 -0600 Subject: [PATCH 04/68] chore: Move `cast` to `spark-expr` crate (#654) * refactor in preparation for moving cast to spark-expr crate * errors * move cast to spark-expr crate * machete * refactor errors * clean up imports --- Cargo.toml | 5 + src/cast.rs | 2016 +++++++++++++++++++++++++++++++++++++++++++++++++++ src/lib.rs | 1 + 3 files changed, 2022 insertions(+) create mode 100644 src/cast.rs diff --git a/Cargo.toml b/Cargo.toml index 4a9b94087321..220417fe8b05 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -28,12 +28,17 @@ edition = { workspace = true } [dependencies] arrow = { workspace = true } +arrow-array = { workspace = true } arrow-schema = { workspace = true } +chrono = { workspace = true } datafusion = { workspace = true } datafusion-common = { workspace = true } datafusion-functions = { workspace = true } +datafusion-expr = { workspace = true } datafusion-physical-expr = { workspace = true } datafusion-comet-utils = { workspace = true } +num = { workspace = true } +regex = { workspace = true } thiserror = { workspace = true } [lib] diff --git a/src/cast.rs b/src/cast.rs new file mode 100644 index 000000000000..b9cf2790b5d2 --- /dev/null +++ b/src/cast.rs @@ -0,0 +1,2016 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::{ + any::Any, + fmt::{Debug, Display, Formatter}, + hash::{Hash, Hasher}, + num::Wrapping, + sync::Arc, +}; + +use arrow::{ + array::{ + cast::AsArray, + types::{Date32Type, Int16Type, Int32Type, Int8Type}, + Array, ArrayRef, BooleanArray, Decimal128Array, Float32Array, Float64Array, + GenericStringArray, Int16Array, Int32Array, Int64Array, Int8Array, OffsetSizeTrait, + PrimitiveArray, + }, + compute::{cast_with_options, unary, CastOptions}, + datatypes::{ + ArrowPrimitiveType, Decimal128Type, DecimalType, Float32Type, Float64Type, Int64Type, + TimestampMicrosecondType, + }, + error::ArrowError, + record_batch::RecordBatch, + util::display::FormatOptions, +}; +use arrow_schema::{DataType, Schema}; + +use datafusion_common::{ + cast::as_generic_string_array, internal_err, Result as DataFusionResult, ScalarValue, +}; +use datafusion_expr::ColumnarValue; +use datafusion_physical_expr::PhysicalExpr; + +use chrono::{NaiveDate, NaiveDateTime, TimeZone, Timelike}; +use num::{ + cast::AsPrimitive, integer::div_floor, traits::CheckedNeg, CheckedSub, Integer, Num, + ToPrimitive, +}; +use regex::Regex; + +use datafusion_comet_utils::{array_with_timezone, down_cast_any_ref}; + +use crate::{EvalMode, SparkError, SparkResult}; + +static TIMESTAMP_FORMAT: Option<&str> = Some("%Y-%m-%d %H:%M:%S%.f"); + +const MICROS_PER_SECOND: i64 = 1000000; + +static CAST_OPTIONS: CastOptions = CastOptions { + safe: true, + format_options: FormatOptions::new() + .with_timestamp_tz_format(TIMESTAMP_FORMAT) + .with_timestamp_format(TIMESTAMP_FORMAT), +}; + +#[derive(Debug, Hash)] +pub struct Cast { + pub child: Arc, + pub data_type: DataType, + pub eval_mode: EvalMode, + + /// When cast from/to timezone related types, we need timezone, which will be resolved with + /// session local timezone by an analyzer in Spark. + pub timezone: String, +} + +macro_rules! cast_utf8_to_int { + ($array:expr, $eval_mode:expr, $array_type:ty, $cast_method:ident) => {{ + let len = $array.len(); + let mut cast_array = PrimitiveArray::<$array_type>::builder(len); + for i in 0..len { + if $array.is_null(i) { + cast_array.append_null() + } else if let Some(cast_value) = $cast_method($array.value(i), $eval_mode)? { + cast_array.append_value(cast_value); + } else { + cast_array.append_null() + } + } + let result: SparkResult = Ok(Arc::new(cast_array.finish()) as ArrayRef); + result + }}; +} + +macro_rules! cast_utf8_to_timestamp { + ($array:expr, $eval_mode:expr, $array_type:ty, $cast_method:ident) => {{ + let len = $array.len(); + let mut cast_array = PrimitiveArray::<$array_type>::builder(len).with_timezone("UTC"); + for i in 0..len { + if $array.is_null(i) { + cast_array.append_null() + } else if let Ok(Some(cast_value)) = $cast_method($array.value(i).trim(), $eval_mode) { + cast_array.append_value(cast_value); + } else { + cast_array.append_null() + } + } + let result: ArrayRef = Arc::new(cast_array.finish()) as ArrayRef; + result + }}; +} + +macro_rules! 
cast_float_to_string { + ($from:expr, $eval_mode:expr, $type:ty, $output_type:ty, $offset_type:ty) => {{ + + fn cast( + from: &dyn Array, + _eval_mode: EvalMode, + ) -> SparkResult + where + OffsetSize: OffsetSizeTrait, { + let array = from.as_any().downcast_ref::<$output_type>().unwrap(); + + // If the absolute number is less than 10,000,000 and greater or equal than 0.001, the + // result is expressed without scientific notation with at least one digit on either side of + // the decimal point. Otherwise, Spark uses a mantissa followed by E and an + // exponent. The mantissa has an optional leading minus sign followed by one digit to the + // left of the decimal point, and the minimal number of digits greater than zero to the + // right. The exponent has and optional leading minus sign. + // source: https://docs.databricks.com/en/sql/language-manual/functions/cast.html + + const LOWER_SCIENTIFIC_BOUND: $type = 0.001; + const UPPER_SCIENTIFIC_BOUND: $type = 10000000.0; + + let output_array = array + .iter() + .map(|value| match value { + Some(value) if value == <$type>::INFINITY => Ok(Some("Infinity".to_string())), + Some(value) if value == <$type>::NEG_INFINITY => Ok(Some("-Infinity".to_string())), + Some(value) + if (value.abs() < UPPER_SCIENTIFIC_BOUND + && value.abs() >= LOWER_SCIENTIFIC_BOUND) + || value.abs() == 0.0 => + { + let trailing_zero = if value.fract() == 0.0 { ".0" } else { "" }; + + Ok(Some(format!("{value}{trailing_zero}"))) + } + Some(value) + if value.abs() >= UPPER_SCIENTIFIC_BOUND + || value.abs() < LOWER_SCIENTIFIC_BOUND => + { + let formatted = format!("{value:E}"); + + if formatted.contains(".") { + Ok(Some(formatted)) + } else { + // `formatted` is already in scientific notation and can be split up by E + // in order to add the missing trailing 0 which gets removed for numbers with a fraction of 0.0 + let prepare_number: Vec<&str> = formatted.split("E").collect(); + + let coefficient = prepare_number[0]; + + let exponent = prepare_number[1]; + + Ok(Some(format!("{coefficient}.0E{exponent}"))) + } + } + Some(value) => Ok(Some(value.to_string())), + _ => Ok(None), + }) + .collect::, SparkError>>()?; + + Ok(Arc::new(output_array)) + } + + cast::<$offset_type>($from, $eval_mode) + }}; +} + +macro_rules! cast_int_to_int_macro { + ( + $array: expr, + $eval_mode:expr, + $from_arrow_primitive_type: ty, + $to_arrow_primitive_type: ty, + $from_data_type: expr, + $to_native_type: ty, + $spark_from_data_type_name: expr, + $spark_to_data_type_name: expr + ) => {{ + let cast_array = $array + .as_any() + .downcast_ref::>() + .unwrap(); + let spark_int_literal_suffix = match $from_data_type { + &DataType::Int64 => "L", + &DataType::Int16 => "S", + &DataType::Int8 => "T", + _ => "", + }; + + let output_array = match $eval_mode { + EvalMode::Legacy => cast_array + .iter() + .map(|value| match value { + Some(value) => { + Ok::, SparkError>(Some(value as $to_native_type)) + } + _ => Ok(None), + }) + .collect::, _>>(), + _ => cast_array + .iter() + .map(|value| match value { + Some(value) => { + let res = <$to_native_type>::try_from(value); + if res.is_err() { + Err(cast_overflow( + &(value.to_string() + spark_int_literal_suffix), + $spark_from_data_type_name, + $spark_to_data_type_name, + )) + } else { + Ok::, SparkError>(Some(res.unwrap())) + } + } + _ => Ok(None), + }) + .collect::, _>>(), + }?; + let result: SparkResult = Ok(Arc::new(output_array) as ArrayRef); + result + }}; +} + +// When Spark casts to Byte/Short Types, it does not cast directly to Byte/Short. 
+// It casts to Int first and then to Byte/Short. Because of potential overflows in the Int cast, +// this can cause unexpected Short/Byte cast results. Replicate this behavior. +macro_rules! cast_float_to_int16_down { + ( + $array:expr, + $eval_mode:expr, + $src_array_type:ty, + $dest_array_type:ty, + $rust_src_type:ty, + $rust_dest_type:ty, + $src_type_str:expr, + $dest_type_str:expr, + $format_str:expr + ) => {{ + let cast_array = $array + .as_any() + .downcast_ref::<$src_array_type>() + .expect(concat!("Expected a ", stringify!($src_array_type))); + + let output_array = match $eval_mode { + EvalMode::Ansi => cast_array + .iter() + .map(|value| match value { + Some(value) => { + let is_overflow = value.is_nan() || value.abs() as i32 == i32::MAX; + if is_overflow { + return Err(cast_overflow( + &format!($format_str, value).replace("e", "E"), + $src_type_str, + $dest_type_str, + )); + } + let i32_value = value as i32; + <$rust_dest_type>::try_from(i32_value) + .map_err(|_| { + cast_overflow( + &format!($format_str, value).replace("e", "E"), + $src_type_str, + $dest_type_str, + ) + }) + .map(Some) + } + None => Ok(None), + }) + .collect::>()?, + _ => cast_array + .iter() + .map(|value| match value { + Some(value) => { + let i32_value = value as i32; + Ok::, SparkError>(Some( + i32_value as $rust_dest_type, + )) + } + None => Ok(None), + }) + .collect::>()?, + }; + Ok(Arc::new(output_array) as ArrayRef) + }}; +} + +macro_rules! cast_float_to_int32_up { + ( + $array:expr, + $eval_mode:expr, + $src_array_type:ty, + $dest_array_type:ty, + $rust_src_type:ty, + $rust_dest_type:ty, + $src_type_str:expr, + $dest_type_str:expr, + $max_dest_val:expr, + $format_str:expr + ) => {{ + let cast_array = $array + .as_any() + .downcast_ref::<$src_array_type>() + .expect(concat!("Expected a ", stringify!($src_array_type))); + + let output_array = match $eval_mode { + EvalMode::Ansi => cast_array + .iter() + .map(|value| match value { + Some(value) => { + let is_overflow = + value.is_nan() || value.abs() as $rust_dest_type == $max_dest_val; + if is_overflow { + return Err(cast_overflow( + &format!($format_str, value).replace("e", "E"), + $src_type_str, + $dest_type_str, + )); + } + Ok(Some(value as $rust_dest_type)) + } + None => Ok(None), + }) + .collect::>()?, + _ => cast_array + .iter() + .map(|value| match value { + Some(value) => { + Ok::, SparkError>(Some(value as $rust_dest_type)) + } + None => Ok(None), + }) + .collect::>()?, + }; + Ok(Arc::new(output_array) as ArrayRef) + }}; +} + +// When Spark casts to Byte/Short Types, it does not cast directly to Byte/Short. +// It casts to Int first and then to Byte/Short. Because of potential overflows in the Int cast, +// this can cause unexpected Short/Byte cast results. Replicate this behavior. +macro_rules! 
cast_decimal_to_int16_down { + ( + $array:expr, + $eval_mode:expr, + $dest_array_type:ty, + $rust_dest_type:ty, + $dest_type_str:expr, + $precision:expr, + $scale:expr + ) => {{ + let cast_array = $array + .as_any() + .downcast_ref::() + .expect(concat!("Expected a Decimal128ArrayType")); + + let output_array = match $eval_mode { + EvalMode::Ansi => cast_array + .iter() + .map(|value| match value { + Some(value) => { + let divisor = 10_i128.pow($scale as u32); + let (truncated, decimal) = (value / divisor, (value % divisor).abs()); + let is_overflow = truncated.abs() > i32::MAX.into(); + if is_overflow { + return Err(cast_overflow( + &format!("{}.{}BD", truncated, decimal), + &format!("DECIMAL({},{})", $precision, $scale), + $dest_type_str, + )); + } + let i32_value = truncated as i32; + <$rust_dest_type>::try_from(i32_value) + .map_err(|_| { + cast_overflow( + &format!("{}.{}BD", truncated, decimal), + &format!("DECIMAL({},{})", $precision, $scale), + $dest_type_str, + ) + }) + .map(Some) + } + None => Ok(None), + }) + .collect::>()?, + _ => cast_array + .iter() + .map(|value| match value { + Some(value) => { + let divisor = 10_i128.pow($scale as u32); + let i32_value = (value / divisor) as i32; + Ok::, SparkError>(Some( + i32_value as $rust_dest_type, + )) + } + None => Ok(None), + }) + .collect::>()?, + }; + Ok(Arc::new(output_array) as ArrayRef) + }}; +} + +macro_rules! cast_decimal_to_int32_up { + ( + $array:expr, + $eval_mode:expr, + $dest_array_type:ty, + $rust_dest_type:ty, + $dest_type_str:expr, + $max_dest_val:expr, + $precision:expr, + $scale:expr + ) => {{ + let cast_array = $array + .as_any() + .downcast_ref::() + .expect(concat!("Expected a Decimal128ArrayType")); + + let output_array = match $eval_mode { + EvalMode::Ansi => cast_array + .iter() + .map(|value| match value { + Some(value) => { + let divisor = 10_i128.pow($scale as u32); + let (truncated, decimal) = (value / divisor, (value % divisor).abs()); + let is_overflow = truncated.abs() > $max_dest_val.into(); + if is_overflow { + return Err(cast_overflow( + &format!("{}.{}BD", truncated, decimal), + &format!("DECIMAL({},{})", $precision, $scale), + $dest_type_str, + )); + } + Ok(Some(truncated as $rust_dest_type)) + } + None => Ok(None), + }) + .collect::>()?, + _ => cast_array + .iter() + .map(|value| match value { + Some(value) => { + let divisor = 10_i128.pow($scale as u32); + let truncated = value / divisor; + Ok::, SparkError>(Some( + truncated as $rust_dest_type, + )) + } + None => Ok(None), + }) + .collect::>()?, + }; + Ok(Arc::new(output_array) as ArrayRef) + }}; +} + +impl Cast { + pub fn new( + child: Arc, + data_type: DataType, + eval_mode: EvalMode, + timezone: String, + ) -> Self { + Self { + child, + data_type, + timezone, + eval_mode, + } + } + + pub fn new_without_timezone( + child: Arc, + data_type: DataType, + eval_mode: EvalMode, + ) -> Self { + Self { + child, + data_type, + timezone: "".to_string(), + eval_mode, + } + } + + fn cast_array(&self, array: ArrayRef) -> DataFusionResult { + let to_type = &self.data_type; + let array = array_with_timezone(array, self.timezone.clone(), Some(to_type))?; + let from_type = array.data_type().clone(); + + // unpack dictionary string arrays first + // TODO: we are unpacking a dictionary-encoded array and then performing + // the cast. 
We could potentially improve performance here by casting the + // dictionary values directly without unpacking the array first, although this + // would add more complexity to the code + let array = match &from_type { + DataType::Dictionary(key_type, value_type) + if key_type.as_ref() == &DataType::Int32 + && (value_type.as_ref() == &DataType::Utf8 + || value_type.as_ref() == &DataType::LargeUtf8) => + { + cast_with_options(&array, value_type.as_ref(), &CAST_OPTIONS)? + } + _ => array, + }; + let from_type = array.data_type(); + + let cast_result = match (from_type, to_type) { + (DataType::Utf8, DataType::Boolean) => { + Self::spark_cast_utf8_to_boolean::(&array, self.eval_mode) + } + (DataType::LargeUtf8, DataType::Boolean) => { + Self::spark_cast_utf8_to_boolean::(&array, self.eval_mode) + } + (DataType::Utf8, DataType::Timestamp(_, _)) => { + Self::cast_string_to_timestamp(&array, to_type, self.eval_mode) + } + (DataType::Utf8, DataType::Date32) => { + Self::cast_string_to_date(&array, to_type, self.eval_mode) + } + (DataType::Int64, DataType::Int32) + | (DataType::Int64, DataType::Int16) + | (DataType::Int64, DataType::Int8) + | (DataType::Int32, DataType::Int16) + | (DataType::Int32, DataType::Int8) + | (DataType::Int16, DataType::Int8) + if self.eval_mode != EvalMode::Try => + { + Self::spark_cast_int_to_int(&array, self.eval_mode, from_type, to_type) + } + ( + DataType::Utf8, + DataType::Int8 | DataType::Int16 | DataType::Int32 | DataType::Int64, + ) => Self::cast_string_to_int::(to_type, &array, self.eval_mode), + ( + DataType::LargeUtf8, + DataType::Int8 | DataType::Int16 | DataType::Int32 | DataType::Int64, + ) => Self::cast_string_to_int::(to_type, &array, self.eval_mode), + (DataType::Float64, DataType::Utf8) => { + Self::spark_cast_float64_to_utf8::(&array, self.eval_mode) + } + (DataType::Float64, DataType::LargeUtf8) => { + Self::spark_cast_float64_to_utf8::(&array, self.eval_mode) + } + (DataType::Float32, DataType::Utf8) => { + Self::spark_cast_float32_to_utf8::(&array, self.eval_mode) + } + (DataType::Float32, DataType::LargeUtf8) => { + Self::spark_cast_float32_to_utf8::(&array, self.eval_mode) + } + (DataType::Float32, DataType::Decimal128(precision, scale)) => { + Self::cast_float32_to_decimal128(&array, *precision, *scale, self.eval_mode) + } + (DataType::Float64, DataType::Decimal128(precision, scale)) => { + Self::cast_float64_to_decimal128(&array, *precision, *scale, self.eval_mode) + } + (DataType::Float32, DataType::Int8) + | (DataType::Float32, DataType::Int16) + | (DataType::Float32, DataType::Int32) + | (DataType::Float32, DataType::Int64) + | (DataType::Float64, DataType::Int8) + | (DataType::Float64, DataType::Int16) + | (DataType::Float64, DataType::Int32) + | (DataType::Float64, DataType::Int64) + | (DataType::Decimal128(_, _), DataType::Int8) + | (DataType::Decimal128(_, _), DataType::Int16) + | (DataType::Decimal128(_, _), DataType::Int32) + | (DataType::Decimal128(_, _), DataType::Int64) + if self.eval_mode != EvalMode::Try => + { + Self::spark_cast_nonintegral_numeric_to_integral( + &array, + self.eval_mode, + from_type, + to_type, + ) + } + _ if Self::is_datafusion_spark_compatible(from_type, to_type) => { + // use DataFusion cast only when we know that it is compatible with Spark + Ok(cast_with_options(&array, to_type, &CAST_OPTIONS)?) 
+ } + _ => { + // we should never reach this code because the Scala code should be checking + // for supported cast operations and falling back to Spark for anything that + // is not yet supported + Err(SparkError::Internal(format!( + "Native cast invoked for unsupported cast from {from_type:?} to {to_type:?}" + ))) + } + }; + Ok(spark_cast(cast_result?, from_type, to_type)) + } + + /// Determines if DataFusion supports the given cast in a way that is + /// compatible with Spark + fn is_datafusion_spark_compatible(from_type: &DataType, to_type: &DataType) -> bool { + if from_type == to_type { + return true; + } + match from_type { + DataType::Boolean => matches!( + to_type, + DataType::Int8 + | DataType::Int16 + | DataType::Int32 + | DataType::Int64 + | DataType::Float32 + | DataType::Float64 + | DataType::Utf8 + ), + DataType::Int8 | DataType::Int16 | DataType::Int32 | DataType::Int64 => { + // note that the cast from Int32/Int64 -> Decimal128 here is actually + // not compatible with Spark (no overflow checks) but we have tests that + // rely on this cast working so we have to leave it here for now + matches!( + to_type, + DataType::Boolean + | DataType::Int8 + | DataType::Int16 + | DataType::Int32 + | DataType::Int64 + | DataType::Float32 + | DataType::Float64 + | DataType::Decimal128(_, _) + | DataType::Utf8 + ) + } + DataType::Float32 | DataType::Float64 => matches!( + to_type, + DataType::Boolean + | DataType::Int8 + | DataType::Int16 + | DataType::Int32 + | DataType::Int64 + | DataType::Float32 + | DataType::Float64 + ), + DataType::Decimal128(_, _) | DataType::Decimal256(_, _) => matches!( + to_type, + DataType::Int8 + | DataType::Int16 + | DataType::Int32 + | DataType::Int64 + | DataType::Float32 + | DataType::Float64 + | DataType::Decimal128(_, _) + | DataType::Decimal256(_, _) + ), + DataType::Utf8 => matches!(to_type, DataType::Binary), + DataType::Date32 => matches!(to_type, DataType::Utf8), + DataType::Timestamp(_, _) => { + matches!( + to_type, + DataType::Int64 | DataType::Date32 | DataType::Utf8 | DataType::Timestamp(_, _) + ) + } + DataType::Binary => { + // note that this is not completely Spark compatible because + // DataFusion only supports binary data containing valid UTF-8 strings + matches!(to_type, DataType::Utf8) + } + _ => false, + } + } + + fn cast_string_to_int( + to_type: &DataType, + array: &ArrayRef, + eval_mode: EvalMode, + ) -> SparkResult { + let string_array = array + .as_any() + .downcast_ref::>() + .expect("cast_string_to_int expected a string array"); + + let cast_array: ArrayRef = match to_type { + DataType::Int8 => { + cast_utf8_to_int!(string_array, eval_mode, Int8Type, cast_string_to_i8)? + } + DataType::Int16 => { + cast_utf8_to_int!(string_array, eval_mode, Int16Type, cast_string_to_i16)? + } + DataType::Int32 => { + cast_utf8_to_int!(string_array, eval_mode, Int32Type, cast_string_to_i32)? + } + DataType::Int64 => { + cast_utf8_to_int!(string_array, eval_mode, Int64Type, cast_string_to_i64)? 
+ } + dt => unreachable!( + "{}", + format!("invalid integer type {dt} in cast from string") + ), + }; + Ok(cast_array) + } + + fn cast_string_to_date( + array: &ArrayRef, + to_type: &DataType, + eval_mode: EvalMode, + ) -> SparkResult { + let string_array = array + .as_any() + .downcast_ref::>() + .expect("Expected a string array"); + + let cast_array: ArrayRef = match to_type { + DataType::Date32 => { + let len = string_array.len(); + let mut cast_array = PrimitiveArray::::builder(len); + for i in 0..len { + if !string_array.is_null(i) { + match date_parser(string_array.value(i), eval_mode) { + Ok(Some(cast_value)) => cast_array.append_value(cast_value), + Ok(None) => cast_array.append_null(), + Err(e) => return Err(e), + } + } else { + cast_array.append_null() + } + } + Arc::new(cast_array.finish()) as ArrayRef + } + _ => unreachable!("Invalid data type {:?} in cast from string", to_type), + }; + Ok(cast_array) + } + + fn cast_string_to_timestamp( + array: &ArrayRef, + to_type: &DataType, + eval_mode: EvalMode, + ) -> SparkResult { + let string_array = array + .as_any() + .downcast_ref::>() + .expect("Expected a string array"); + + let cast_array: ArrayRef = match to_type { + DataType::Timestamp(_, _) => { + cast_utf8_to_timestamp!( + string_array, + eval_mode, + TimestampMicrosecondType, + timestamp_parser + ) + } + _ => unreachable!("Invalid data type {:?} in cast from string", to_type), + }; + Ok(cast_array) + } + + fn cast_float64_to_decimal128( + array: &dyn Array, + precision: u8, + scale: i8, + eval_mode: EvalMode, + ) -> SparkResult { + Self::cast_floating_point_to_decimal128::(array, precision, scale, eval_mode) + } + + fn cast_float32_to_decimal128( + array: &dyn Array, + precision: u8, + scale: i8, + eval_mode: EvalMode, + ) -> SparkResult { + Self::cast_floating_point_to_decimal128::(array, precision, scale, eval_mode) + } + + fn cast_floating_point_to_decimal128( + array: &dyn Array, + precision: u8, + scale: i8, + eval_mode: EvalMode, + ) -> SparkResult + where + ::Native: AsPrimitive, + { + let input = array.as_any().downcast_ref::>().unwrap(); + let mut cast_array = PrimitiveArray::::builder(input.len()); + + let mul = 10_f64.powi(scale as i32); + + for i in 0..input.len() { + if input.is_null(i) { + cast_array.append_null(); + } else { + let input_value = input.value(i).as_(); + let value = (input_value * mul).round().to_i128(); + + match value { + Some(v) => { + if Decimal128Type::validate_decimal_precision(v, precision).is_err() { + if eval_mode == EvalMode::Ansi { + return Err(SparkError::NumericValueOutOfRange { + value: input_value.to_string(), + precision, + scale, + }); + } else { + cast_array.append_null(); + } + } + cast_array.append_value(v); + } + None => { + if eval_mode == EvalMode::Ansi { + return Err(SparkError::NumericValueOutOfRange { + value: input_value.to_string(), + precision, + scale, + }); + } else { + cast_array.append_null(); + } + } + } + } + } + + let res = Arc::new( + cast_array + .with_precision_and_scale(precision, scale)? 
+ .finish(), + ) as ArrayRef; + Ok(res) + } + + fn spark_cast_float64_to_utf8( + from: &dyn Array, + _eval_mode: EvalMode, + ) -> SparkResult + where + OffsetSize: OffsetSizeTrait, + { + cast_float_to_string!(from, _eval_mode, f64, Float64Array, OffsetSize) + } + + fn spark_cast_float32_to_utf8( + from: &dyn Array, + _eval_mode: EvalMode, + ) -> SparkResult + where + OffsetSize: OffsetSizeTrait, + { + cast_float_to_string!(from, _eval_mode, f32, Float32Array, OffsetSize) + } + + fn spark_cast_int_to_int( + array: &dyn Array, + eval_mode: EvalMode, + from_type: &DataType, + to_type: &DataType, + ) -> SparkResult { + match (from_type, to_type) { + (DataType::Int64, DataType::Int32) => cast_int_to_int_macro!( + array, eval_mode, Int64Type, Int32Type, from_type, i32, "BIGINT", "INT" + ), + (DataType::Int64, DataType::Int16) => cast_int_to_int_macro!( + array, eval_mode, Int64Type, Int16Type, from_type, i16, "BIGINT", "SMALLINT" + ), + (DataType::Int64, DataType::Int8) => cast_int_to_int_macro!( + array, eval_mode, Int64Type, Int8Type, from_type, i8, "BIGINT", "TINYINT" + ), + (DataType::Int32, DataType::Int16) => cast_int_to_int_macro!( + array, eval_mode, Int32Type, Int16Type, from_type, i16, "INT", "SMALLINT" + ), + (DataType::Int32, DataType::Int8) => cast_int_to_int_macro!( + array, eval_mode, Int32Type, Int8Type, from_type, i8, "INT", "TINYINT" + ), + (DataType::Int16, DataType::Int8) => cast_int_to_int_macro!( + array, eval_mode, Int16Type, Int8Type, from_type, i8, "SMALLINT", "TINYINT" + ), + _ => unreachable!( + "{}", + format!("invalid integer type {to_type} in cast from {from_type}") + ), + } + } + + fn spark_cast_utf8_to_boolean( + from: &dyn Array, + eval_mode: EvalMode, + ) -> SparkResult + where + OffsetSize: OffsetSizeTrait, + { + let array = from + .as_any() + .downcast_ref::>() + .unwrap(); + + let output_array = array + .iter() + .map(|value| match value { + Some(value) => match value.to_ascii_lowercase().trim() { + "t" | "true" | "y" | "yes" | "1" => Ok(Some(true)), + "f" | "false" | "n" | "no" | "0" => Ok(Some(false)), + _ if eval_mode == EvalMode::Ansi => Err(SparkError::CastInvalidValue { + value: value.to_string(), + from_type: "STRING".to_string(), + to_type: "BOOLEAN".to_string(), + }), + _ => Ok(None), + }, + _ => Ok(None), + }) + .collect::>()?; + + Ok(Arc::new(output_array)) + } + + fn spark_cast_nonintegral_numeric_to_integral( + array: &dyn Array, + eval_mode: EvalMode, + from_type: &DataType, + to_type: &DataType, + ) -> SparkResult { + match (from_type, to_type) { + (DataType::Float32, DataType::Int8) => cast_float_to_int16_down!( + array, + eval_mode, + Float32Array, + Int8Array, + f32, + i8, + "FLOAT", + "TINYINT", + "{:e}" + ), + (DataType::Float32, DataType::Int16) => cast_float_to_int16_down!( + array, + eval_mode, + Float32Array, + Int16Array, + f32, + i16, + "FLOAT", + "SMALLINT", + "{:e}" + ), + (DataType::Float32, DataType::Int32) => cast_float_to_int32_up!( + array, + eval_mode, + Float32Array, + Int32Array, + f32, + i32, + "FLOAT", + "INT", + i32::MAX, + "{:e}" + ), + (DataType::Float32, DataType::Int64) => cast_float_to_int32_up!( + array, + eval_mode, + Float32Array, + Int64Array, + f32, + i64, + "FLOAT", + "BIGINT", + i64::MAX, + "{:e}" + ), + (DataType::Float64, DataType::Int8) => cast_float_to_int16_down!( + array, + eval_mode, + Float64Array, + Int8Array, + f64, + i8, + "DOUBLE", + "TINYINT", + "{:e}D" + ), + (DataType::Float64, DataType::Int16) => cast_float_to_int16_down!( + array, + eval_mode, + Float64Array, + Int16Array, + f64, + i16, + 
"DOUBLE", + "SMALLINT", + "{:e}D" + ), + (DataType::Float64, DataType::Int32) => cast_float_to_int32_up!( + array, + eval_mode, + Float64Array, + Int32Array, + f64, + i32, + "DOUBLE", + "INT", + i32::MAX, + "{:e}D" + ), + (DataType::Float64, DataType::Int64) => cast_float_to_int32_up!( + array, + eval_mode, + Float64Array, + Int64Array, + f64, + i64, + "DOUBLE", + "BIGINT", + i64::MAX, + "{:e}D" + ), + (DataType::Decimal128(precision, scale), DataType::Int8) => { + cast_decimal_to_int16_down!( + array, eval_mode, Int8Array, i8, "TINYINT", precision, *scale + ) + } + (DataType::Decimal128(precision, scale), DataType::Int16) => { + cast_decimal_to_int16_down!( + array, eval_mode, Int16Array, i16, "SMALLINT", precision, *scale + ) + } + (DataType::Decimal128(precision, scale), DataType::Int32) => { + cast_decimal_to_int32_up!( + array, + eval_mode, + Int32Array, + i32, + "INT", + i32::MAX, + *precision, + *scale + ) + } + (DataType::Decimal128(precision, scale), DataType::Int64) => { + cast_decimal_to_int32_up!( + array, + eval_mode, + Int64Array, + i64, + "BIGINT", + i64::MAX, + *precision, + *scale + ) + } + _ => unreachable!( + "{}", + format!("invalid cast from non-integral numeric type: {from_type} to integral numeric type: {to_type}") + ), + } + } +} + +/// Equivalent to org.apache.spark.unsafe.types.UTF8String.toByte +fn cast_string_to_i8(str: &str, eval_mode: EvalMode) -> SparkResult> { + Ok(cast_string_to_int_with_range_check( + str, + eval_mode, + "TINYINT", + i8::MIN as i32, + i8::MAX as i32, + )? + .map(|v| v as i8)) +} + +/// Equivalent to org.apache.spark.unsafe.types.UTF8String.toShort +fn cast_string_to_i16(str: &str, eval_mode: EvalMode) -> SparkResult> { + Ok(cast_string_to_int_with_range_check( + str, + eval_mode, + "SMALLINT", + i16::MIN as i32, + i16::MAX as i32, + )? + .map(|v| v as i16)) +} + +/// Equivalent to org.apache.spark.unsafe.types.UTF8String.toInt(IntWrapper intWrapper) +fn cast_string_to_i32(str: &str, eval_mode: EvalMode) -> SparkResult> { + do_cast_string_to_int::(str, eval_mode, "INT", i32::MIN) +} + +/// Equivalent to org.apache.spark.unsafe.types.UTF8String.toLong(LongWrapper intWrapper) +fn cast_string_to_i64(str: &str, eval_mode: EvalMode) -> SparkResult> { + do_cast_string_to_int::(str, eval_mode, "BIGINT", i64::MIN) +} + +fn cast_string_to_int_with_range_check( + str: &str, + eval_mode: EvalMode, + type_name: &str, + min: i32, + max: i32, +) -> SparkResult> { + match do_cast_string_to_int(str, eval_mode, type_name, i32::MIN)? 
{ + None => Ok(None), + Some(v) if v >= min && v <= max => Ok(Some(v)), + _ if eval_mode == EvalMode::Ansi => Err(invalid_value(str, "STRING", type_name)), + _ => Ok(None), + } +} + +/// Equivalent to +/// - org.apache.spark.unsafe.types.UTF8String.toInt(IntWrapper intWrapper, boolean allowDecimal) +/// - org.apache.spark.unsafe.types.UTF8String.toLong(LongWrapper longWrapper, boolean allowDecimal) +fn do_cast_string_to_int< + T: Num + PartialOrd + Integer + CheckedSub + CheckedNeg + From + Copy, +>( + str: &str, + eval_mode: EvalMode, + type_name: &str, + min_value: T, +) -> SparkResult> { + let trimmed_str = str.trim(); + if trimmed_str.is_empty() { + return none_or_err(eval_mode, type_name, str); + } + let len = trimmed_str.len(); + let mut result: T = T::zero(); + let mut negative = false; + let radix = T::from(10); + let stop_value = min_value / radix; + let mut parse_sign_and_digits = true; + + for (i, ch) in trimmed_str.char_indices() { + if parse_sign_and_digits { + if i == 0 { + negative = ch == '-'; + let positive = ch == '+'; + if negative || positive { + if i + 1 == len { + // input string is just "+" or "-" + return none_or_err(eval_mode, type_name, str); + } + // consume this char + continue; + } + } + + if ch == '.' { + if eval_mode == EvalMode::Legacy { + // truncate decimal in legacy mode + parse_sign_and_digits = false; + continue; + } else { + return none_or_err(eval_mode, type_name, str); + } + } + + let digit = if ch.is_ascii_digit() { + (ch as u32) - ('0' as u32) + } else { + return none_or_err(eval_mode, type_name, str); + }; + + // We are going to process the new digit and accumulate the result. However, before + // doing this, if the result is already smaller than the + // stopValue(Integer.MIN_VALUE / radix), then result * 10 will definitely be + // smaller than minValue, and we can stop + if result < stop_value { + return none_or_err(eval_mode, type_name, str); + } + + // Since the previous result is greater than or equal to stopValue(Integer.MIN_VALUE / + // radix), we can just use `result > 0` to check overflow. 
If result + // overflows, we should stop + let v = result * radix; + let digit = (digit as i32).into(); + match v.checked_sub(&digit) { + Some(x) if x <= T::zero() => result = x, + _ => { + return none_or_err(eval_mode, type_name, str); + } + } + } else { + // make sure fractional digits are valid digits but ignore them + if !ch.is_ascii_digit() { + return none_or_err(eval_mode, type_name, str); + } + } + } + + if !negative { + if let Some(neg) = result.checked_neg() { + if neg < T::zero() { + return none_or_err(eval_mode, type_name, str); + } + result = neg; + } else { + return none_or_err(eval_mode, type_name, str); + } + } + + Ok(Some(result)) +} + +/// Either return Ok(None) or Err(SparkError::CastInvalidValue) depending on the evaluation mode +#[inline] +fn none_or_err(eval_mode: EvalMode, type_name: &str, str: &str) -> SparkResult> { + match eval_mode { + EvalMode::Ansi => Err(invalid_value(str, "STRING", type_name)), + _ => Ok(None), + } +} + +#[inline] +fn invalid_value(value: &str, from_type: &str, to_type: &str) -> SparkError { + SparkError::CastInvalidValue { + value: value.to_string(), + from_type: from_type.to_string(), + to_type: to_type.to_string(), + } +} + +#[inline] +fn cast_overflow(value: &str, from_type: &str, to_type: &str) -> SparkError { + SparkError::CastOverFlow { + value: value.to_string(), + from_type: from_type.to_string(), + to_type: to_type.to_string(), + } +} + +impl Display for Cast { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!( + f, + "Cast [data_type: {}, timezone: {}, child: {}, eval_mode: {:?}]", + self.data_type, self.timezone, self.child, &self.eval_mode + ) + } +} + +impl PartialEq for Cast { + fn eq(&self, other: &dyn Any) -> bool { + down_cast_any_ref(other) + .downcast_ref::() + .map(|x| { + self.child.eq(&x.child) + && self.timezone.eq(&x.timezone) + && self.data_type.eq(&x.data_type) + && self.eval_mode.eq(&x.eval_mode) + }) + .unwrap_or(false) + } +} + +impl PhysicalExpr for Cast { + fn as_any(&self) -> &dyn Any { + self + } + + fn data_type(&self, _: &Schema) -> DataFusionResult { + Ok(self.data_type.clone()) + } + + fn nullable(&self, _: &Schema) -> DataFusionResult { + Ok(true) + } + + fn evaluate(&self, batch: &RecordBatch) -> DataFusionResult { + let arg = self.child.evaluate(batch)?; + match arg { + ColumnarValue::Array(array) => Ok(ColumnarValue::Array(self.cast_array(array)?)), + ColumnarValue::Scalar(scalar) => { + // Note that normally CAST(scalar) should be fold in Spark JVM side. However, for + // some cases e.g., scalar subquery, Spark will not fold it, so we need to handle it + // here. 
+ let array = scalar.to_array()?; + let scalar = ScalarValue::try_from_array(&self.cast_array(array)?, 0)?; + Ok(ColumnarValue::Scalar(scalar)) + } + } + } + + fn children(&self) -> Vec<&Arc> { + vec![&self.child] + } + + fn with_new_children( + self: Arc, + children: Vec>, + ) -> datafusion_common::Result> { + match children.len() { + 1 => Ok(Arc::new(Cast::new( + children[0].clone(), + self.data_type.clone(), + self.eval_mode, + self.timezone.clone(), + ))), + _ => internal_err!("Cast should have exactly one child"), + } + } + + fn dyn_hash(&self, state: &mut dyn Hasher) { + let mut s = state; + self.child.hash(&mut s); + self.data_type.hash(&mut s); + self.timezone.hash(&mut s); + self.eval_mode.hash(&mut s); + self.hash(&mut s); + } +} + +fn timestamp_parser(value: &str, eval_mode: EvalMode) -> SparkResult> { + let value = value.trim(); + if value.is_empty() { + return Ok(None); + } + // Define regex patterns and corresponding parsing functions + let patterns = &[ + ( + Regex::new(r"^\d{4}$").unwrap(), + parse_str_to_year_timestamp as fn(&str) -> SparkResult>, + ), + ( + Regex::new(r"^\d{4}-\d{2}$").unwrap(), + parse_str_to_month_timestamp, + ), + ( + Regex::new(r"^\d{4}-\d{2}-\d{2}$").unwrap(), + parse_str_to_day_timestamp, + ), + ( + Regex::new(r"^\d{4}-\d{2}-\d{2}T\d{1,2}$").unwrap(), + parse_str_to_hour_timestamp, + ), + ( + Regex::new(r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}$").unwrap(), + parse_str_to_minute_timestamp, + ), + ( + Regex::new(r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}$").unwrap(), + parse_str_to_second_timestamp, + ), + ( + Regex::new(r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{1,6}$").unwrap(), + parse_str_to_microsecond_timestamp, + ), + ( + Regex::new(r"^T\d{1,2}$").unwrap(), + parse_str_to_time_only_timestamp, + ), + ]; + + let mut timestamp = None; + + // Iterate through patterns and try matching + for (pattern, parse_func) in patterns { + if pattern.is_match(value) { + timestamp = parse_func(value)?; + break; + } + } + + if timestamp.is_none() { + return if eval_mode == EvalMode::Ansi { + Err(SparkError::CastInvalidValue { + value: value.to_string(), + from_type: "STRING".to_string(), + to_type: "TIMESTAMP".to_string(), + }) + } else { + Ok(None) + }; + } + + match timestamp { + Some(ts) => Ok(Some(ts)), + None => Err(SparkError::Internal( + "Failed to parse timestamp".to_string(), + )), + } +} + +fn parse_ymd_timestamp(year: i32, month: u32, day: u32) -> SparkResult> { + let datetime = chrono::Utc.with_ymd_and_hms(year, month, day, 0, 0, 0); + + // Check if datetime is not None + let utc_datetime = match datetime.single() { + Some(dt) => dt.with_timezone(&chrono::Utc), + None => { + return Err(SparkError::Internal( + "Failed to parse timestamp".to_string(), + )); + } + }; + + Ok(Some(utc_datetime.timestamp_micros())) +} + +fn parse_hms_timestamp( + year: i32, + month: u32, + day: u32, + hour: u32, + minute: u32, + second: u32, + microsecond: u32, +) -> SparkResult> { + let datetime = chrono::Utc.with_ymd_and_hms(year, month, day, hour, minute, second); + + // Check if datetime is not None + let utc_datetime = match datetime.single() { + Some(dt) => dt + .with_timezone(&chrono::Utc) + .with_nanosecond(microsecond * 1000), + None => { + return Err(SparkError::Internal( + "Failed to parse timestamp".to_string(), + )); + } + }; + + let result = match utc_datetime { + Some(dt) => dt.timestamp_micros(), + None => { + return Err(SparkError::Internal( + "Failed to parse timestamp".to_string(), + )); + } + }; + + Ok(Some(result)) +} + +fn get_timestamp_values(value: &str, 
timestamp_type: &str) -> SparkResult> { + let values: Vec<_> = value + .split(|c| c == 'T' || c == '-' || c == ':' || c == '.') + .collect(); + let year = values[0].parse::().unwrap_or_default(); + let month = values.get(1).map_or(1, |m| m.parse::().unwrap_or(1)); + let day = values.get(2).map_or(1, |d| d.parse::().unwrap_or(1)); + let hour = values.get(3).map_or(0, |h| h.parse::().unwrap_or(0)); + let minute = values.get(4).map_or(0, |m| m.parse::().unwrap_or(0)); + let second = values.get(5).map_or(0, |s| s.parse::().unwrap_or(0)); + let microsecond = values.get(6).map_or(0, |ms| ms.parse::().unwrap_or(0)); + + match timestamp_type { + "year" => parse_ymd_timestamp(year, 1, 1), + "month" => parse_ymd_timestamp(year, month, 1), + "day" => parse_ymd_timestamp(year, month, day), + "hour" => parse_hms_timestamp(year, month, day, hour, 0, 0, 0), + "minute" => parse_hms_timestamp(year, month, day, hour, minute, 0, 0), + "second" => parse_hms_timestamp(year, month, day, hour, minute, second, 0), + "microsecond" => parse_hms_timestamp(year, month, day, hour, minute, second, microsecond), + _ => Err(SparkError::CastInvalidValue { + value: value.to_string(), + from_type: "STRING".to_string(), + to_type: "TIMESTAMP".to_string(), + }), + } +} + +fn parse_str_to_year_timestamp(value: &str) -> SparkResult> { + get_timestamp_values(value, "year") +} + +fn parse_str_to_month_timestamp(value: &str) -> SparkResult> { + get_timestamp_values(value, "month") +} + +fn parse_str_to_day_timestamp(value: &str) -> SparkResult> { + get_timestamp_values(value, "day") +} + +fn parse_str_to_hour_timestamp(value: &str) -> SparkResult> { + get_timestamp_values(value, "hour") +} + +fn parse_str_to_minute_timestamp(value: &str) -> SparkResult> { + get_timestamp_values(value, "minute") +} + +fn parse_str_to_second_timestamp(value: &str) -> SparkResult> { + get_timestamp_values(value, "second") +} + +fn parse_str_to_microsecond_timestamp(value: &str) -> SparkResult> { + get_timestamp_values(value, "microsecond") +} + +fn parse_str_to_time_only_timestamp(value: &str) -> SparkResult> { + let values: Vec<&str> = value.split('T').collect(); + let time_values: Vec = values[1] + .split(':') + .map(|v| v.parse::().unwrap_or(0)) + .collect(); + + let datetime = chrono::Utc::now(); + let timestamp = datetime + .with_hour(time_values.first().copied().unwrap_or_default()) + .and_then(|dt| dt.with_minute(*time_values.get(1).unwrap_or(&0))) + .and_then(|dt| dt.with_second(*time_values.get(2).unwrap_or(&0))) + .and_then(|dt| dt.with_nanosecond(*time_values.get(3).unwrap_or(&0) * 1_000)) + .map(|dt| dt.to_utc().timestamp_micros()) + .unwrap_or_default(); + + Ok(Some(timestamp)) +} + +//a string to date parser - port of spark's SparkDateTimeUtils#stringToDate. +fn date_parser(date_str: &str, eval_mode: EvalMode) -> SparkResult> { + // local functions + fn get_trimmed_start(bytes: &[u8]) -> usize { + let mut start = 0; + while start < bytes.len() && is_whitespace_or_iso_control(bytes[start]) { + start += 1; + } + start + } + + fn get_trimmed_end(start: usize, bytes: &[u8]) -> usize { + let mut end = bytes.len() - 1; + while end > start && is_whitespace_or_iso_control(bytes[end]) { + end -= 1; + } + end + 1 + } + + fn is_whitespace_or_iso_control(byte: u8) -> bool { + byte.is_ascii_whitespace() || byte.is_ascii_control() + } + + fn is_valid_digits(segment: i32, digits: usize) -> bool { + // An integer is able to represent a date within [+-]5 million years. 
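+        // (a year of up to five million has at most 7 digits, hence the cap below)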
+ let max_digits_year = 7; + //year (segment 0) can be between 4 to 7 digits, + //month and day (segment 1 and 2) can be between 1 to 2 digits + (segment == 0 && digits >= 4 && digits <= max_digits_year) + || (segment != 0 && digits > 0 && digits <= 2) + } + + fn return_result(date_str: &str, eval_mode: EvalMode) -> SparkResult> { + if eval_mode == EvalMode::Ansi { + Err(SparkError::CastInvalidValue { + value: date_str.to_string(), + from_type: "STRING".to_string(), + to_type: "DATE".to_string(), + }) + } else { + Ok(None) + } + } + // end local functions + + if date_str.is_empty() { + return return_result(date_str, eval_mode); + } + + //values of date segments year, month and day defaulting to 1 + let mut date_segments = [1, 1, 1]; + let mut sign = 1; + let mut current_segment = 0; + let mut current_segment_value = Wrapping(0); + let mut current_segment_digits = 0; + let bytes = date_str.as_bytes(); + + let mut j = get_trimmed_start(bytes); + let str_end_trimmed = get_trimmed_end(j, bytes); + + if j == str_end_trimmed { + return return_result(date_str, eval_mode); + } + + //assign a sign to the date + if bytes[j] == b'-' || bytes[j] == b'+' { + sign = if bytes[j] == b'-' { -1 } else { 1 }; + j += 1; + } + + //loop to the end of string until we have processed 3 segments, + //exit loop on encountering any space ' ' or 'T' after the 3rd segment + while j < str_end_trimmed && (current_segment < 3 && !(bytes[j] == b' ' || bytes[j] == b'T')) { + let b = bytes[j]; + if current_segment < 2 && b == b'-' { + //check for validity of year and month segments if current byte is separator + if !is_valid_digits(current_segment, current_segment_digits) { + return return_result(date_str, eval_mode); + } + //if valid update corresponding segment with the current segment value. + date_segments[current_segment as usize] = current_segment_value.0; + current_segment_value = Wrapping(0); + current_segment_digits = 0; + current_segment += 1; + } else if !b.is_ascii_digit() { + return return_result(date_str, eval_mode); + } else { + //increment value of current segment by the next digit + let parsed_value = Wrapping((b - b'0') as i32); + current_segment_value = current_segment_value * Wrapping(10) + parsed_value; + current_segment_digits += 1; + } + j += 1; + } + + //check for validity of last segment + if !is_valid_digits(current_segment, current_segment_digits) { + return return_result(date_str, eval_mode); + } + + if current_segment < 2 && j < str_end_trimmed { + // For the `yyyy` and `yyyy-[m]m` formats, entire input must be consumed. + return return_result(date_str, eval_mode); + } + + date_segments[current_segment as usize] = current_segment_value.0; + + match NaiveDate::from_ymd_opt( + sign * date_segments[0], + date_segments[1] as u32, + date_segments[2] as u32, + ) { + Some(date) => { + let duration_since_epoch = date + .signed_duration_since(NaiveDateTime::UNIX_EPOCH.date()) + .num_days(); + Ok(Some(duration_since_epoch.to_i32().unwrap())) + } + None => Ok(None), + } +} + +/// This takes for special casting cases of Spark. E.g., Timestamp to Long. +/// This function runs as a post process of the DataFusion cast(). By the time it arrives here, +/// Dictionary arrays are already unpacked by the DataFusion cast() since Spark cannot specify +/// Dictionary as to_type. The from_type is taken before the DataFusion cast() runs in +/// expressions/cast.rs, so it can be still Dictionary. 
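+///
+/// For example (illustrative only): casting `Timestamp(Microsecond)` to `Int64`
+/// floor-divides the microsecond value by `MICROS_PER_SECOND`, so `-1_500_000`
+/// (1.5 seconds before the epoch) becomes `-2` rather than `-1`.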
+fn spark_cast(array: ArrayRef, from_type: &DataType, to_type: &DataType) -> ArrayRef { + match (from_type, to_type) { + (DataType::Timestamp(_, _), DataType::Int64) => { + // See Spark's `Cast` expression + unary_dyn::<_, Int64Type>(&array, |v| div_floor(v, MICROS_PER_SECOND)).unwrap() + } + (DataType::Dictionary(_, value_type), DataType::Int64) + if matches!(value_type.as_ref(), &DataType::Timestamp(_, _)) => + { + // See Spark's `Cast` expression + unary_dyn::<_, Int64Type>(&array, |v| div_floor(v, MICROS_PER_SECOND)).unwrap() + } + (DataType::Timestamp(_, _), DataType::Utf8) => remove_trailing_zeroes(array), + (DataType::Dictionary(_, value_type), DataType::Utf8) + if matches!(value_type.as_ref(), &DataType::Timestamp(_, _)) => + { + remove_trailing_zeroes(array) + } + _ => array, + } +} + +/// A fork & modified version of Arrow's `unary_dyn` which is being deprecated +fn unary_dyn(array: &ArrayRef, op: F) -> Result +where + T: ArrowPrimitiveType, + F: Fn(T::Native) -> T::Native, +{ + if let Some(d) = array.as_any_dictionary_opt() { + let new_values = unary_dyn::(d.values(), op)?; + return Ok(Arc::new(d.with_values(Arc::new(new_values)))); + } + + match array.as_primitive_opt::() { + Some(a) if PrimitiveArray::::is_compatible(a.data_type()) => { + Ok(Arc::new(unary::( + array.as_any().downcast_ref::>().unwrap(), + op, + ))) + } + _ => Err(ArrowError::NotYetImplemented(format!( + "Cannot perform unary operation of type {} on array of type {}", + T::DATA_TYPE, + array.data_type() + ))), + } +} + +/// Remove any trailing zeroes in the string if they occur after in the fractional seconds, +/// to match Spark behavior +/// example: +/// "1970-01-01 05:29:59.900" => "1970-01-01 05:29:59.9" +/// "1970-01-01 05:29:59.990" => "1970-01-01 05:29:59.99" +/// "1970-01-01 05:29:59.999" => "1970-01-01 05:29:59.999" +/// "1970-01-01 05:30:00" => "1970-01-01 05:30:00" +/// "1970-01-01 05:30:00.001" => "1970-01-01 05:30:00.001" +fn remove_trailing_zeroes(array: ArrayRef) -> ArrayRef { + let string_array = as_generic_string_array::(&array).unwrap(); + let result = string_array + .iter() + .map(|s| s.map(trim_end)) + .collect::>(); + Arc::new(result) as ArrayRef +} + +fn trim_end(s: &str) -> &str { + if s.rfind('.').is_some() { + s.trim_end_matches('0') + } else { + s + } +} + +#[cfg(test)] +mod tests { + use arrow::datatypes::TimestampMicrosecondType; + use arrow_array::StringArray; + use arrow_schema::TimeUnit; + + use datafusion_physical_expr::expressions::Column; + + use super::*; + + #[test] + #[cfg_attr(miri, ignore)] // test takes too long with miri + fn timestamp_parser_test() { + // write for all formats + assert_eq!( + timestamp_parser("2020", EvalMode::Legacy).unwrap(), + Some(1577836800000000) // this is in milliseconds + ); + assert_eq!( + timestamp_parser("2020-01", EvalMode::Legacy).unwrap(), + Some(1577836800000000) + ); + assert_eq!( + timestamp_parser("2020-01-01", EvalMode::Legacy).unwrap(), + Some(1577836800000000) + ); + assert_eq!( + timestamp_parser("2020-01-01T12", EvalMode::Legacy).unwrap(), + Some(1577880000000000) + ); + assert_eq!( + timestamp_parser("2020-01-01T12:34", EvalMode::Legacy).unwrap(), + Some(1577882040000000) + ); + assert_eq!( + timestamp_parser("2020-01-01T12:34:56", EvalMode::Legacy).unwrap(), + Some(1577882096000000) + ); + assert_eq!( + timestamp_parser("2020-01-01T12:34:56.123456", EvalMode::Legacy).unwrap(), + Some(1577882096123456) + ); + // assert_eq!( + // timestamp_parser("T2", EvalMode::Legacy).unwrap(), + // Some(1714356000000000) // this value needs 
to change everyday. + // ); + } + + #[test] + #[cfg_attr(miri, ignore)] // test takes too long with miri + fn test_cast_string_to_timestamp() { + let array: ArrayRef = Arc::new(StringArray::from(vec![ + Some("2020-01-01T12:34:56.123456"), + Some("T2"), + ])); + + let string_array = array + .as_any() + .downcast_ref::>() + .expect("Expected a string array"); + + let eval_mode = EvalMode::Legacy; + let result = cast_utf8_to_timestamp!( + &string_array, + eval_mode, + TimestampMicrosecondType, + timestamp_parser + ); + + assert_eq!( + result.data_type(), + &DataType::Timestamp(TimeUnit::Microsecond, Some("UTC".into())) + ); + assert_eq!(result.len(), 2); + } + + #[test] + fn date_parser_test() { + for date in &[ + "2020", + "2020-01", + "2020-01-01", + "02020-01-01", + "002020-01-01", + "0002020-01-01", + "2020-1-1", + "2020-01-01 ", + "2020-01-01T", + ] { + for eval_mode in &[EvalMode::Legacy, EvalMode::Ansi, EvalMode::Try] { + assert_eq!(date_parser(*date, *eval_mode).unwrap(), Some(18262)); + } + } + + //dates in invalid formats + for date in &[ + "abc", + "", + "not_a_date", + "3/", + "3/12", + "3/12/2020", + "3/12/2002 T", + "202", + "2020-010-01", + "2020-10-010", + "2020-10-010T", + "--262143-12-31", + "--262143-12-31 ", + ] { + for eval_mode in &[EvalMode::Legacy, EvalMode::Try] { + assert_eq!(date_parser(*date, *eval_mode).unwrap(), None); + } + assert!(date_parser(*date, EvalMode::Ansi).is_err()); + } + + for date in &["-3638-5"] { + for eval_mode in &[EvalMode::Legacy, EvalMode::Try, EvalMode::Ansi] { + assert_eq!(date_parser(*date, *eval_mode).unwrap(), Some(-2048160)); + } + } + + //Naive Date only supports years 262142 AD to 262143 BC + //returns None for dates out of range supported by Naive Date. + for date in &[ + "-262144-1-1", + "262143-01-1", + "262143-1-1", + "262143-01-1 ", + "262143-01-01T ", + "262143-1-01T 1234", + "-0973250", + ] { + for eval_mode in &[EvalMode::Legacy, EvalMode::Try, EvalMode::Ansi] { + assert_eq!(date_parser(*date, *eval_mode).unwrap(), None); + } + } + } + + #[test] + fn test_cast_string_to_date() { + let array: ArrayRef = Arc::new(StringArray::from(vec![ + Some("2020"), + Some("2020-01"), + Some("2020-01-01"), + Some("2020-01-01T"), + ])); + + let result = + Cast::cast_string_to_date(&array, &DataType::Date32, EvalMode::Legacy).unwrap(); + + let date32_array = result + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(date32_array.len(), 4); + date32_array + .iter() + .for_each(|v| assert_eq!(v.unwrap(), 18262)); + } + + #[test] + fn test_cast_string_array_with_valid_dates() { + let array_with_invalid_date: ArrayRef = Arc::new(StringArray::from(vec![ + Some("-262143-12-31"), + Some("\n -262143-12-31 "), + Some("-262143-12-31T \t\n"), + Some("\n\t-262143-12-31T\r"), + Some("-262143-12-31T 123123123"), + Some("\r\n-262143-12-31T \r123123123"), + Some("\n -262143-12-31T \n\t"), + ])); + + for eval_mode in &[EvalMode::Legacy, EvalMode::Try, EvalMode::Ansi] { + let result = + Cast::cast_string_to_date(&array_with_invalid_date, &DataType::Date32, *eval_mode) + .unwrap(); + + let date32_array = result + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(result.len(), 7); + date32_array + .iter() + .for_each(|v| assert_eq!(v.unwrap(), -96464928)); + } + } + + #[test] + fn test_cast_string_array_with_invalid_dates() { + let array_with_invalid_date: ArrayRef = Arc::new(StringArray::from(vec![ + Some("2020"), + Some("2020-01"), + Some("2020-01-01"), + //4 invalid dates + Some("2020-010-01T"), + Some("202"), + Some(" 202 "), + Some("\n 2020-\r8 
"), + Some("2020-01-01T"), + // Overflows i32 + Some("-4607172990231812908"), + ])); + + for eval_mode in &[EvalMode::Legacy, EvalMode::Try] { + let result = + Cast::cast_string_to_date(&array_with_invalid_date, &DataType::Date32, *eval_mode) + .unwrap(); + + let date32_array = result + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!( + date32_array.iter().collect::>(), + vec![ + Some(18262), + Some(18262), + Some(18262), + None, + None, + None, + None, + Some(18262), + None + ] + ); + } + + let result = + Cast::cast_string_to_date(&array_with_invalid_date, &DataType::Date32, EvalMode::Ansi); + match result { + Err(e) => assert!( + e.to_string().contains( + "[CAST_INVALID_INPUT] The value '2020-010-01T' of the type \"STRING\" cannot be cast to \"DATE\" because it is malformed") + ), + _ => panic!("Expected error"), + } + } + + #[test] + fn test_cast_string_as_i8() { + // basic + assert_eq!( + cast_string_to_i8("127", EvalMode::Legacy).unwrap(), + Some(127_i8) + ); + assert_eq!(cast_string_to_i8("128", EvalMode::Legacy).unwrap(), None); + assert!(cast_string_to_i8("128", EvalMode::Ansi).is_err()); + // decimals + assert_eq!( + cast_string_to_i8("0.2", EvalMode::Legacy).unwrap(), + Some(0_i8) + ); + assert_eq!( + cast_string_to_i8(".", EvalMode::Legacy).unwrap(), + Some(0_i8) + ); + // TRY should always return null for decimals + assert_eq!(cast_string_to_i8("0.2", EvalMode::Try).unwrap(), None); + assert_eq!(cast_string_to_i8(".", EvalMode::Try).unwrap(), None); + // ANSI mode should throw error on decimal + assert!(cast_string_to_i8("0.2", EvalMode::Ansi).is_err()); + assert!(cast_string_to_i8(".", EvalMode::Ansi).is_err()); + } + + #[test] + fn test_cast_unsupported_timestamp_to_date() { + // Since datafusion uses chrono::Datetime internally not all dates representable by TimestampMicrosecondType are supported + let timestamps: PrimitiveArray = vec![i64::MAX].into(); + let cast = Cast::new( + Arc::new(Column::new("a", 0)), + DataType::Date32, + EvalMode::Legacy, + "UTC".to_owned(), + ); + let result = cast.cast_array(Arc::new(timestamps.with_timezone("Europe/Copenhagen"))); + assert!(result.is_err()) + } + + #[test] + fn test_cast_invalid_timezone() { + let timestamps: PrimitiveArray = vec![i64::MAX].into(); + let cast = Cast::new( + Arc::new(Column::new("a", 0)), + DataType::Date32, + EvalMode::Legacy, + "Not a valid timezone".to_owned(), + ); + let result = cast.cast_array(Arc::new(timestamps.with_timezone("Europe/Copenhagen"))); + assert!(result.is_err()) + } +} diff --git a/src/lib.rs b/src/lib.rs index 57da56f9aca6..93c7f249eb2e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -16,6 +16,7 @@ // under the License. 
mod abs; +pub mod cast; mod error; mod if_expr; From fb7b1981bf528481fdb43606b24e8b829c457470 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 12 Jul 2024 16:13:38 -0600 Subject: [PATCH 05/68] remove utils crate and move utils into spark-expr crate (#658) --- Cargo.toml | 3 +- src/cast.rs | 2 +- src/if_expr.rs | 2 +- src/lib.rs | 3 + src/timezone.rs | 143 +++++++++++++++++++++++++++++++++++ src/utils.rs | 196 ++++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 346 insertions(+), 3 deletions(-) create mode 100644 src/timezone.rs create mode 100644 src/utils.rs diff --git a/Cargo.toml b/Cargo.toml index 220417fe8b05..976a1f36f354 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -36,7 +36,8 @@ datafusion-common = { workspace = true } datafusion-functions = { workspace = true } datafusion-expr = { workspace = true } datafusion-physical-expr = { workspace = true } -datafusion-comet-utils = { workspace = true } +datafusion-physical-plan = { workspace = true } +chrono-tz = { workspace = true } num = { workspace = true } regex = { workspace = true } thiserror = { workspace = true } diff --git a/src/cast.rs b/src/cast.rs index b9cf2790b5d2..7f53583e8d76 100644 --- a/src/cast.rs +++ b/src/cast.rs @@ -55,7 +55,7 @@ use num::{ }; use regex::Regex; -use datafusion_comet_utils::{array_with_timezone, down_cast_any_ref}; +use crate::utils::{array_with_timezone, down_cast_any_ref}; use crate::{EvalMode, SparkError, SparkResult}; diff --git a/src/if_expr.rs b/src/if_expr.rs index c04494ec4ffb..fa52c5d5b9b9 100644 --- a/src/if_expr.rs +++ b/src/if_expr.rs @@ -31,7 +31,7 @@ use datafusion::logical_expr::ColumnarValue; use datafusion_common::{cast::as_boolean_array, Result}; use datafusion_physical_expr::PhysicalExpr; -use datafusion_comet_utils::down_cast_any_ref; +use crate::utils::down_cast_any_ref; #[derive(Debug, Hash)] pub struct IfExpr { diff --git a/src/lib.rs b/src/lib.rs index 93c7f249eb2e..3c726f52a8e8 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -20,6 +20,9 @@ pub mod cast; mod error; mod if_expr; +pub mod timezone; +pub mod utils; + pub use abs::Abs; pub use error::{SparkError, SparkResult}; pub use if_expr::IfExpr; diff --git a/src/timezone.rs b/src/timezone.rs new file mode 100644 index 000000000000..7aad386aa915 --- /dev/null +++ b/src/timezone.rs @@ -0,0 +1,143 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +/// Utils for timezone. This is basically from arrow-array::timezone (private). 
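+///
+/// For example, `"+09:00".parse::<Tz>()` yields a fixed offset, while
+/// `"America/New_York".parse::<Tz>()` resolves a named IANA timezone via
+/// `chrono_tz`; see `impl FromStr for Tz` below.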
+use arrow_schema::ArrowError; +use chrono::{ + format::{parse, Parsed, StrftimeItems}, + offset::TimeZone, + FixedOffset, LocalResult, NaiveDate, NaiveDateTime, Offset, +}; +use std::str::FromStr; + +/// Parses a fixed offset of the form "+09:00" +fn parse_fixed_offset(tz: &str) -> Result { + let mut parsed = Parsed::new(); + + if let Ok(fixed_offset) = + parse(&mut parsed, tz, StrftimeItems::new("%:z")).and_then(|_| parsed.to_fixed_offset()) + { + return Ok(fixed_offset); + } + + if let Ok(fixed_offset) = + parse(&mut parsed, tz, StrftimeItems::new("%#z")).and_then(|_| parsed.to_fixed_offset()) + { + return Ok(fixed_offset); + } + + Err(ArrowError::ParseError(format!( + "Invalid timezone \"{}\": Expected format [+-]XX:XX, [+-]XX, or [+-]XXXX", + tz + ))) +} + +/// An [`Offset`] for [`Tz`] +#[derive(Debug, Copy, Clone)] +pub struct TzOffset { + tz: Tz, + offset: FixedOffset, +} + +impl std::fmt::Display for TzOffset { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.offset.fmt(f) + } +} + +impl Offset for TzOffset { + fn fix(&self) -> FixedOffset { + self.offset + } +} + +/// An Arrow [`TimeZone`] +#[derive(Debug, Copy, Clone)] +pub struct Tz(TzInner); + +#[derive(Debug, Copy, Clone)] +enum TzInner { + Timezone(chrono_tz::Tz), + Offset(FixedOffset), +} + +impl FromStr for Tz { + type Err = ArrowError; + + fn from_str(tz: &str) -> Result { + if tz.starts_with('+') || tz.starts_with('-') { + Ok(Self(TzInner::Offset(parse_fixed_offset(tz)?))) + } else { + Ok(Self(TzInner::Timezone(tz.parse().map_err(|e| { + ArrowError::ParseError(format!("Invalid timezone \"{}\": {}", tz, e)) + })?))) + } + } +} + +macro_rules! tz { + ($s:ident, $tz:ident, $b:block) => { + match $s.0 { + TzInner::Timezone($tz) => $b, + TzInner::Offset($tz) => $b, + } + }; +} + +impl TimeZone for Tz { + type Offset = TzOffset; + + fn from_offset(offset: &Self::Offset) -> Self { + offset.tz + } + + fn offset_from_local_date(&self, local: &NaiveDate) -> LocalResult { + tz!(self, tz, { + tz.offset_from_local_date(local).map(|x| TzOffset { + tz: *self, + offset: x.fix(), + }) + }) + } + + fn offset_from_local_datetime(&self, local: &NaiveDateTime) -> LocalResult { + tz!(self, tz, { + tz.offset_from_local_datetime(local).map(|x| TzOffset { + tz: *self, + offset: x.fix(), + }) + }) + } + + fn offset_from_utc_date(&self, utc: &NaiveDate) -> Self::Offset { + tz!(self, tz, { + TzOffset { + tz: *self, + offset: tz.offset_from_utc_date(utc).fix(), + } + }) + } + + fn offset_from_utc_datetime(&self, utc: &NaiveDateTime) -> Self::Offset { + tz!(self, tz, { + TzOffset { + tz: *self, + offset: tz.offset_from_utc_datetime(utc).fix(), + } + }) + } +} diff --git a/src/utils.rs b/src/utils.rs new file mode 100644 index 000000000000..6945e82b3e4f --- /dev/null +++ b/src/utils.rs @@ -0,0 +1,196 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow_array::{ + cast::as_primitive_array, + types::{Int32Type, TimestampMicrosecondType}, +}; +use arrow_schema::{ArrowError, DataType}; +use std::any::Any; +use std::sync::Arc; + +use crate::timezone::Tz; +use arrow::{ + array::{as_dictionary_array, Array, ArrayRef, PrimitiveArray}, + temporal_conversions::as_datetime, +}; +use chrono::{DateTime, Offset, TimeZone}; + +use datafusion_physical_plan::PhysicalExpr; + +/// A utility function from DataFusion. It is not exposed by DataFusion. +pub fn down_cast_any_ref(any: &dyn Any) -> &dyn Any { + if any.is::>() { + any.downcast_ref::>() + .unwrap() + .as_any() + } else if any.is::>() { + any.downcast_ref::>() + .unwrap() + .as_any() + } else { + any + } +} + +/// Preprocesses input arrays to add timezone information from Spark to Arrow array datatype or +/// to apply timezone offset. +// +// We consider the following cases: +// +// | --------------------- | ------------ | ----------------- | -------------------------------- | +// | Conversion | Input array | Timezone | Output array | +// | --------------------- | ------------ | ----------------- | -------------------------------- | +// | Timestamp -> | Array in UTC | Timezone of input | A timestamp with the timezone | +// | Utf8 or Date32 | | | offset applied and timezone | +// | | | | removed | +// | --------------------- | ------------ | ----------------- | -------------------------------- | +// | Timestamp -> | Array in UTC | Timezone of input | Same as input array | +// | Timestamp w/Timezone| | | | +// | --------------------- | ------------ | ----------------- | -------------------------------- | +// | Timestamp_ntz -> | Array in | Timezone of input | Same as input array | +// | Utf8 or Date32 | timezone | | | +// | | session local| | | +// | | timezone | | | +// | --------------------- | ------------ | ----------------- | -------------------------------- | +// | Timestamp_ntz -> | Array in | Timezone of input | Array in UTC and timezone | +// | Timestamp w/Timezone | session local| | specified in input | +// | | timezone | | | +// | --------------------- | ------------ | ----------------- | -------------------------------- | +// | Timestamp(_ntz) -> | | +// | Any other type | Not Supported | +// | --------------------- | ------------ | ----------------- | -------------------------------- | +// +pub fn array_with_timezone( + array: ArrayRef, + timezone: String, + to_type: Option<&DataType>, +) -> Result { + match array.data_type() { + DataType::Timestamp(_, None) => { + assert!(!timezone.is_empty()); + match to_type { + Some(DataType::Utf8) | Some(DataType::Date32) => Ok(array), + Some(DataType::Timestamp(_, Some(_))) => { + timestamp_ntz_to_timestamp(array, timezone.as_str(), Some(timezone.as_str())) + } + _ => { + // Not supported + panic!( + "Cannot convert from {:?} to {:?}", + array.data_type(), + to_type.unwrap() + ) + } + } + } + DataType::Timestamp(_, Some(_)) => { + assert!(!timezone.is_empty()); + let array = as_primitive_array::(&array); + let array_with_timezone = array.clone().with_timezone(timezone.clone()); + let array = Arc::new(array_with_timezone) as ArrayRef; + match to_type { + Some(DataType::Utf8) | Some(DataType::Date32) => { + pre_timestamp_cast(array, timezone) + } + _ => Ok(array), + } + } + DataType::Dictionary(_, value_type) + if matches!(value_type.as_ref(), &DataType::Timestamp(_, _)) => + { + let dict = as_dictionary_array::(&array); + let array 
= as_primitive_array::(dict.values()); + let array_with_timezone = + array_with_timezone(Arc::new(array.clone()) as ArrayRef, timezone, to_type)?; + let dict = dict.with_values(array_with_timezone); + Ok(Arc::new(dict)) + } + _ => Ok(array), + } +} + +fn datetime_cast_err(value: i64) -> ArrowError { + ArrowError::CastError(format!( + "Cannot convert TimestampMicrosecondType {value} to datetime. Comet only supports dates between Jan 1, 262145 BCE and Dec 31, 262143 CE", + )) +} + +/// Takes in a Timestamp(Microsecond, None) array and a timezone id, and returns +/// a Timestamp(Microsecond, Some<_>) array. +/// The understanding is that the input array has time in the timezone specified in the second +/// argument. +/// Parameters: +/// array - input array of timestamp without timezone +/// tz - timezone of the values in the input array +/// to_timezone - timezone to change the input values to +fn timestamp_ntz_to_timestamp( + array: ArrayRef, + tz: &str, + to_timezone: Option<&str>, +) -> Result { + assert!(!tz.is_empty()); + match array.data_type() { + DataType::Timestamp(_, None) => { + let array = as_primitive_array::(&array); + let tz: Tz = tz.parse()?; + let array: PrimitiveArray = array.try_unary(|value| { + as_datetime::(value) + .ok_or_else(|| datetime_cast_err(value)) + .map(|local_datetime| { + let datetime: DateTime = + tz.from_local_datetime(&local_datetime).unwrap(); + datetime.timestamp_micros() + }) + })?; + let array_with_tz = if let Some(to_tz) = to_timezone { + array.with_timezone(to_tz) + } else { + array + }; + Ok(Arc::new(array_with_tz)) + } + _ => Ok(array), + } +} + +/// This takes for special pre-casting cases of Spark. E.g., Timestamp to String. +fn pre_timestamp_cast(array: ArrayRef, timezone: String) -> Result { + assert!(!timezone.is_empty()); + match array.data_type() { + DataType::Timestamp(_, _) => { + // Spark doesn't output timezone while casting timestamp to string, but arrow's cast + // kernel does if timezone exists. So we need to apply offset of timezone to array + // timestamp value and remove timezone from array datatype. + let array = as_primitive_array::(&array); + + let tz: Tz = timezone.parse()?; + let array: PrimitiveArray = array.try_unary(|value| { + as_datetime::(value) + .ok_or_else(|| datetime_cast_err(value)) + .map(|datetime| { + let offset = tz.offset_from_utc_datetime(&datetime).fix(); + let datetime = datetime + offset; + datetime.and_utc().timestamp_micros() + }) + })?; + + Ok(Arc::new(array)) + } + _ => Ok(array), + } +} From d510649c789526c500ed1973655a9da54f4bbdea Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Mon, 15 Jul 2024 11:47:44 -0600 Subject: [PATCH 06/68] chore: Move temporal kernels and expressions to spark-expr crate (#660) * Move temporal expressions to spark-expr crate * reduce public api * reduce public api * update imports in benchmarks * fmt * remove unused dep --- src/kernels/mod.rs | 20 + src/kernels/temporal.rs | 1148 +++++++++++++++++++++++++++++++++++++++ src/lib.rs | 6 +- src/temporal.rs | 534 ++++++++++++++++++ 4 files changed, 1707 insertions(+), 1 deletion(-) create mode 100644 src/kernels/mod.rs create mode 100644 src/kernels/temporal.rs create mode 100644 src/temporal.rs diff --git a/src/kernels/mod.rs b/src/kernels/mod.rs new file mode 100644 index 000000000000..88aa34b1a3f8 --- /dev/null +++ b/src/kernels/mod.rs @@ -0,0 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Kernels + +pub(crate) mod temporal; diff --git a/src/kernels/temporal.rs b/src/kernels/temporal.rs new file mode 100644 index 000000000000..6f2474e8d7a8 --- /dev/null +++ b/src/kernels/temporal.rs @@ -0,0 +1,1148 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! temporal kernels + +use chrono::{DateTime, Datelike, Duration, NaiveDateTime, Timelike, Utc}; + +use std::sync::Arc; + +use arrow::{array::*, datatypes::DataType}; +use arrow_array::{ + downcast_dictionary_array, downcast_temporal_array, + temporal_conversions::*, + timezone::Tz, + types::{ArrowDictionaryKeyType, ArrowTemporalType, Date32Type, TimestampMicrosecondType}, + ArrowNumericType, +}; + +use arrow_schema::TimeUnit; + +use crate::SparkError; + +// Copied from arrow_arith/temporal.rs +macro_rules! 
return_compute_error_with { + ($msg:expr, $param:expr) => { + return { Err(SparkError::Internal(format!("{}: {:?}", $msg, $param))) } + }; +} + +// The number of days between the beginning of the proleptic gregorian calendar (0001-01-01) +// and the beginning of the Unix Epoch (1970-01-01) +const DAYS_TO_UNIX_EPOCH: i32 = 719_163; + +// Copied from arrow_arith/temporal.rs with modification to the output datatype +// Transforms a array of NaiveDate to an array of Date32 after applying an operation +fn as_datetime_with_op, T: ArrowTemporalType, F>( + iter: ArrayIter, + mut builder: PrimitiveBuilder, + op: F, +) -> Date32Array +where + F: Fn(NaiveDateTime) -> i32, + i64: From, +{ + iter.into_iter().for_each(|value| { + if let Some(value) = value { + match as_datetime::(i64::from(value)) { + Some(dt) => builder.append_value(op(dt)), + None => builder.append_null(), + } + } else { + builder.append_null(); + } + }); + + builder.finish() +} + +#[inline] +fn as_datetime_with_op_single( + value: Option, + builder: &mut PrimitiveBuilder, + op: F, +) where + F: Fn(NaiveDateTime) -> i32, +{ + if let Some(value) = value { + match as_datetime::(i64::from(value)) { + Some(dt) => builder.append_value(op(dt)), + None => builder.append_null(), + } + } else { + builder.append_null(); + } +} + +// Based on arrow_arith/temporal.rs:extract_component_from_datetime_array +// Transforms an array of DateTime to an arrayOf TimeStampMicrosecond after applying an +// operation +fn as_timestamp_tz_with_op, T: ArrowTemporalType, F>( + iter: ArrayIter, + mut builder: PrimitiveBuilder, + tz: &str, + op: F, +) -> Result +where + F: Fn(DateTime) -> i64, + i64: From, +{ + let tz: Tz = tz.parse()?; + for value in iter { + match value { + Some(value) => match as_datetime_with_timezone::(value.into(), tz) { + Some(time) => builder.append_value(op(time)), + _ => { + return Err(SparkError::Internal( + "Unable to read value as datetime".to_string(), + )); + } + }, + None => builder.append_null(), + } + } + Ok(builder.finish()) +} + +fn as_timestamp_tz_with_op_single( + value: Option, + builder: &mut PrimitiveBuilder, + tz: &Tz, + op: F, +) -> Result<(), SparkError> +where + F: Fn(DateTime) -> i64, + i64: From, +{ + match value { + Some(value) => match as_datetime_with_timezone::(value.into(), *tz) { + Some(time) => builder.append_value(op(time)), + _ => { + return Err(SparkError::Internal( + "Unable to read value as datetime".to_string(), + )); + } + }, + None => builder.append_null(), + } + Ok(()) +} + +#[inline] +fn as_days_from_unix_epoch(dt: Option) -> i32 { + dt.unwrap().num_days_from_ce() - DAYS_TO_UNIX_EPOCH +} + +// Apply the Tz to the Naive Date Time,,convert to UTC, and return as microseconds in Unix epoch +#[inline] +fn as_micros_from_unix_epoch_utc(dt: Option>) -> i64 { + dt.unwrap().with_timezone(&Utc).timestamp_micros() +} + +#[inline] +fn trunc_date_to_year(dt: T) -> Option { + Some(dt) + .and_then(|d| d.with_nanosecond(0)) + .and_then(|d| d.with_second(0)) + .and_then(|d| d.with_minute(0)) + .and_then(|d| d.with_hour(0)) + .and_then(|d| d.with_day0(0)) + .and_then(|d| d.with_month0(0)) +} + +/// returns the month of the beginning of the quarter +#[inline] +fn quarter_month(dt: &T) -> u32 { + 1 + 3 * ((dt.month() - 1) / 3) +} + +#[inline] +fn trunc_date_to_quarter(dt: T) -> Option { + Some(dt) + .and_then(|d| d.with_nanosecond(0)) + .and_then(|d| d.with_second(0)) + .and_then(|d| d.with_minute(0)) + .and_then(|d| d.with_hour(0)) + .and_then(|d| d.with_day0(0)) + .and_then(|d| d.with_month(quarter_month(&d))) +} + 
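+// For example (illustrative): truncating 2020-08-15T10:11:12 to the month below
+// yields 2020-08-01T00:00:00, while truncating it to the quarter above yields
+// 2020-07-01T00:00:00 because quarter_month maps August to July.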
+#[inline] +fn trunc_date_to_month(dt: T) -> Option { + Some(dt) + .and_then(|d| d.with_nanosecond(0)) + .and_then(|d| d.with_second(0)) + .and_then(|d| d.with_minute(0)) + .and_then(|d| d.with_hour(0)) + .and_then(|d| d.with_day0(0)) +} + +#[inline] +fn trunc_date_to_week(dt: T) -> Option +where + T: Datelike + Timelike + std::ops::Sub + Copy, +{ + Some(dt) + .map(|d| d - Duration::try_seconds(60 * 60 * 24 * d.weekday() as i64).unwrap()) + .and_then(|d| d.with_nanosecond(0)) + .and_then(|d| d.with_second(0)) + .and_then(|d| d.with_minute(0)) + .and_then(|d| d.with_hour(0)) +} + +#[inline] +fn trunc_date_to_day(dt: T) -> Option { + Some(dt) + .and_then(|d| d.with_nanosecond(0)) + .and_then(|d| d.with_second(0)) + .and_then(|d| d.with_minute(0)) + .and_then(|d| d.with_hour(0)) +} + +#[inline] +fn trunc_date_to_hour(dt: T) -> Option { + Some(dt) + .and_then(|d| d.with_nanosecond(0)) + .and_then(|d| d.with_second(0)) + .and_then(|d| d.with_minute(0)) +} + +#[inline] +fn trunc_date_to_minute(dt: T) -> Option { + Some(dt) + .and_then(|d| d.with_nanosecond(0)) + .and_then(|d| d.with_second(0)) +} + +#[inline] +fn trunc_date_to_second(dt: T) -> Option { + Some(dt).and_then(|d| d.with_nanosecond(0)) +} + +#[inline] +fn trunc_date_to_ms(dt: T) -> Option { + Some(dt).and_then(|d| d.with_nanosecond(1_000_000 * (d.nanosecond() / 1_000_000))) +} + +#[inline] +fn trunc_date_to_microsec(dt: T) -> Option { + Some(dt).and_then(|d| d.with_nanosecond(1_000 * (d.nanosecond() / 1_000))) +} + +/// +/// Implements the spark [TRUNC](https://spark.apache.org/docs/latest/api/sql/index.html#trunc) +/// function where the specified format is a scalar value +/// +/// array is an array of Date32 values. The array may be a dictionary array. +/// +/// format is a scalar string specifying the format to apply to the timestamp value. 
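+///
+/// For example (illustrative): `date_trunc_dyn(&dates, "MONTH".to_string())`
+/// returns a `Date32` array with every value moved back to the first day of its
+/// month. Supported formats are YEAR/YYYY/YY, QUARTER, MONTH/MON/MM and WEEK.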
+pub(crate) fn date_trunc_dyn(array: &dyn Array, format: String) -> Result { + match array.data_type().clone() { + DataType::Dictionary(_, _) => { + downcast_dictionary_array!( + array => { + let truncated_values = date_trunc_dyn(array.values(), format)?; + Ok(Arc::new(array.with_values(truncated_values))) + } + dt => return_compute_error_with!("date_trunc does not support", dt), + ) + } + _ => { + downcast_temporal_array!( + array => { + date_trunc(array, format) + .map(|a| Arc::new(a) as ArrayRef) + } + dt => return_compute_error_with!("date_trunc does not support", dt), + ) + } + } +} + +pub(crate) fn date_trunc( + array: &PrimitiveArray, + format: String, +) -> Result +where + T: ArrowTemporalType + ArrowNumericType, + i64: From, +{ + let builder = Date32Builder::with_capacity(array.len()); + let iter = ArrayIter::new(array); + match array.data_type() { + DataType::Date32 => match format.to_uppercase().as_str() { + "YEAR" | "YYYY" | "YY" => Ok(as_datetime_with_op::<&PrimitiveArray, T, _>( + iter, + builder, + |dt| as_days_from_unix_epoch(trunc_date_to_year(dt)), + )), + "QUARTER" => Ok(as_datetime_with_op::<&PrimitiveArray, T, _>( + iter, + builder, + |dt| as_days_from_unix_epoch(trunc_date_to_quarter(dt)), + )), + "MONTH" | "MON" | "MM" => Ok(as_datetime_with_op::<&PrimitiveArray, T, _>( + iter, + builder, + |dt| as_days_from_unix_epoch(trunc_date_to_month(dt)), + )), + "WEEK" => Ok(as_datetime_with_op::<&PrimitiveArray, T, _>( + iter, + builder, + |dt| as_days_from_unix_epoch(trunc_date_to_week(dt)), + )), + _ => Err(SparkError::Internal(format!( + "Unsupported format: {:?} for function 'date_trunc'", + format + ))), + }, + dt => return_compute_error_with!( + "Unsupported input type '{:?}' for function 'date_trunc'", + dt + ), + } +} + +/// +/// Implements the spark [TRUNC](https://spark.apache.org/docs/latest/api/sql/index.html#trunc) +/// function where the specified format may be an array +/// +/// array is an array of Date32 values. The array may be a dictionary array. +/// +/// format is an array of strings specifying the format to apply to the corresponding date value. +/// The array may be a dictionary array. 
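+///
+/// For example (illustrative): given dates `[2020-08-15, 2020-08-15]` and
+/// formats `["YEAR", "MONTH"]`, the result is `[2020-01-01, 2020-08-01]`;
+/// each date is truncated using the format at the same index.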
+pub(crate) fn date_trunc_array_fmt_dyn( + array: &dyn Array, + formats: &dyn Array, +) -> Result { + match (array.data_type().clone(), formats.data_type().clone()) { + (DataType::Dictionary(_, v), DataType::Dictionary(_, f)) => { + if !matches!(*v, DataType::Date32) { + return_compute_error_with!("date_trunc does not support", v) + } + if !matches!(*f, DataType::Utf8) { + return_compute_error_with!("date_trunc does not support format type ", f) + } + downcast_dictionary_array!( + formats => { + downcast_dictionary_array!( + array => { + date_trunc_array_fmt_dict_dict( + &array.downcast_dict::().unwrap(), + &formats.downcast_dict::().unwrap()) + .map(|a| Arc::new(a) as ArrayRef) + } + dt => return_compute_error_with!("date_trunc does not support", dt) + ) + } + fmt => return_compute_error_with!("date_trunc does not support format type", fmt), + ) + } + (DataType::Dictionary(_, v), DataType::Utf8) => { + if !matches!(*v, DataType::Date32) { + return_compute_error_with!("date_trunc does not support", v) + } + downcast_dictionary_array!( + array => { + date_trunc_array_fmt_dict_plain( + &array.downcast_dict::().unwrap(), + formats.as_any().downcast_ref::() + .expect("Unexpected value type in formats")) + .map(|a| Arc::new(a) as ArrayRef) + } + dt => return_compute_error_with!("date_trunc does not support", dt), + ) + } + (DataType::Date32, DataType::Dictionary(_, f)) => { + if !matches!(*f, DataType::Utf8) { + return_compute_error_with!("date_trunc does not support format type ", f) + } + downcast_dictionary_array!( + formats => { + downcast_temporal_array!(array => { + date_trunc_array_fmt_plain_dict( + array.as_any().downcast_ref::() + .expect("Unexpected error in casting date array"), + &formats.downcast_dict::().unwrap()) + .map(|a| Arc::new(a) as ArrayRef) + } + dt => return_compute_error_with!("date_trunc does not support", dt), + ) + } + fmt => return_compute_error_with!("date_trunc does not support format type", fmt), + ) + } + (DataType::Date32, DataType::Utf8) => date_trunc_array_fmt_plain_plain( + array + .as_any() + .downcast_ref::() + .expect("Unexpected error in casting date array"), + formats + .as_any() + .downcast_ref::() + .expect("Unexpected value type in formats"), + ) + .map(|a| Arc::new(a) as ArrayRef), + (dt, fmt) => Err(SparkError::Internal(format!( + "Unsupported datatype: {:}, format: {:?} for function 'date_trunc'", + dt, fmt + ))), + } +} + +macro_rules! date_trunc_array_fmt_helper { + ($array: ident, $formats: ident, $datatype: ident) => {{ + let mut builder = Date32Builder::with_capacity($array.len()); + let iter = $array.into_iter(); + match $datatype { + DataType::Date32 => { + for (index, val) in iter.enumerate() { + let op_result = match $formats.value(index).to_uppercase().as_str() { + "YEAR" | "YYYY" | "YY" => { + Ok(as_datetime_with_op_single(val, &mut builder, |dt| { + as_days_from_unix_epoch(trunc_date_to_year(dt)) + })) + } + "QUARTER" => Ok(as_datetime_with_op_single(val, &mut builder, |dt| { + as_days_from_unix_epoch(trunc_date_to_quarter(dt)) + })), + "MONTH" | "MON" | "MM" => { + Ok(as_datetime_with_op_single(val, &mut builder, |dt| { + as_days_from_unix_epoch(trunc_date_to_month(dt)) + })) + } + "WEEK" => Ok(as_datetime_with_op_single(val, &mut builder, |dt| { + as_days_from_unix_epoch(trunc_date_to_week(dt)) + })), + _ => Err(SparkError::Internal(format!( + "Unsupported format: {:?} for function 'date_trunc'", + $formats.value(index) + ))), + }; + op_result? 
+ } + Ok(builder.finish()) + } + dt => return_compute_error_with!( + "Unsupported input type '{:?}' for function 'date_trunc'", + dt + ), + } + }}; +} + +fn date_trunc_array_fmt_plain_plain( + array: &Date32Array, + formats: &StringArray, +) -> Result +where +{ + let data_type = array.data_type(); + date_trunc_array_fmt_helper!(array, formats, data_type) +} + +fn date_trunc_array_fmt_plain_dict( + array: &Date32Array, + formats: &TypedDictionaryArray, +) -> Result +where + K: ArrowDictionaryKeyType, +{ + let data_type = array.data_type(); + date_trunc_array_fmt_helper!(array, formats, data_type) +} + +fn date_trunc_array_fmt_dict_plain( + array: &TypedDictionaryArray, + formats: &StringArray, +) -> Result +where + K: ArrowDictionaryKeyType, +{ + let data_type = array.values().data_type(); + date_trunc_array_fmt_helper!(array, formats, data_type) +} + +fn date_trunc_array_fmt_dict_dict( + array: &TypedDictionaryArray, + formats: &TypedDictionaryArray, +) -> Result +where + K: ArrowDictionaryKeyType, + F: ArrowDictionaryKeyType, +{ + let data_type = array.values().data_type(); + date_trunc_array_fmt_helper!(array, formats, data_type) +} + +/// +/// Implements the spark [DATE_TRUNC](https://spark.apache.org/docs/latest/api/sql/index.html#date_trunc) +/// function where the specified format is a scalar value +/// +/// array is an array of Timestamp(Microsecond) values. Timestamp values must have a valid +/// timezone or no timezone. The array may be a dictionary array. +/// +/// format is a scalar string specifying the format to apply to the timestamp value. +pub(crate) fn timestamp_trunc_dyn( + array: &dyn Array, + format: String, +) -> Result { + match array.data_type().clone() { + DataType::Dictionary(_, _) => { + downcast_dictionary_array!( + array => { + let truncated_values = timestamp_trunc_dyn(array.values(), format)?; + Ok(Arc::new(array.with_values(truncated_values))) + } + dt => return_compute_error_with!("timestamp_trunc does not support", dt), + ) + } + _ => { + downcast_temporal_array!( + array => { + timestamp_trunc(array, format) + .map(|a| Arc::new(a) as ArrayRef) + } + dt => return_compute_error_with!("timestamp_trunc does not support", dt), + ) + } + } +} + +pub(crate) fn timestamp_trunc( + array: &PrimitiveArray, + format: String, +) -> Result +where + T: ArrowTemporalType + ArrowNumericType, + i64: From, +{ + let builder = TimestampMicrosecondBuilder::with_capacity(array.len()); + let iter = ArrayIter::new(array); + match array.data_type() { + DataType::Timestamp(TimeUnit::Microsecond, Some(tz)) => { + match format.to_uppercase().as_str() { + "YEAR" | "YYYY" | "YY" => { + as_timestamp_tz_with_op::<&PrimitiveArray, T, _>(iter, builder, tz, |dt| { + as_micros_from_unix_epoch_utc(trunc_date_to_year(dt)) + }) + } + "QUARTER" => { + as_timestamp_tz_with_op::<&PrimitiveArray, T, _>(iter, builder, tz, |dt| { + as_micros_from_unix_epoch_utc(trunc_date_to_quarter(dt)) + }) + } + "MONTH" | "MON" | "MM" => { + as_timestamp_tz_with_op::<&PrimitiveArray, T, _>(iter, builder, tz, |dt| { + as_micros_from_unix_epoch_utc(trunc_date_to_month(dt)) + }) + } + "WEEK" => { + as_timestamp_tz_with_op::<&PrimitiveArray, T, _>(iter, builder, tz, |dt| { + as_micros_from_unix_epoch_utc(trunc_date_to_week(dt)) + }) + } + "DAY" | "DD" => { + as_timestamp_tz_with_op::<&PrimitiveArray, T, _>(iter, builder, tz, |dt| { + as_micros_from_unix_epoch_utc(trunc_date_to_day(dt)) + }) + } + "HOUR" => { + as_timestamp_tz_with_op::<&PrimitiveArray, T, _>(iter, builder, tz, |dt| { + 
as_micros_from_unix_epoch_utc(trunc_date_to_hour(dt)) + }) + } + "MINUTE" => { + as_timestamp_tz_with_op::<&PrimitiveArray, T, _>(iter, builder, tz, |dt| { + as_micros_from_unix_epoch_utc(trunc_date_to_minute(dt)) + }) + } + "SECOND" => { + as_timestamp_tz_with_op::<&PrimitiveArray, T, _>(iter, builder, tz, |dt| { + as_micros_from_unix_epoch_utc(trunc_date_to_second(dt)) + }) + } + "MILLISECOND" => { + as_timestamp_tz_with_op::<&PrimitiveArray, T, _>(iter, builder, tz, |dt| { + as_micros_from_unix_epoch_utc(trunc_date_to_ms(dt)) + }) + } + "MICROSECOND" => { + as_timestamp_tz_with_op::<&PrimitiveArray, T, _>(iter, builder, tz, |dt| { + as_micros_from_unix_epoch_utc(trunc_date_to_microsec(dt)) + }) + } + _ => Err(SparkError::Internal(format!( + "Unsupported format: {:?} for function 'timestamp_trunc'", + format + ))), + } + } + dt => return_compute_error_with!( + "Unsupported input type '{:?}' for function 'timestamp_trunc'", + dt + ), + } +} + +/// +/// Implements the spark [DATE_TRUNC](https://spark.apache.org/docs/latest/api/sql/index.html#date_trunc) +/// function where the specified format may be an array +/// +/// array is an array of Timestamp(Microsecond) values. Timestamp values must have a valid +/// timezone or no timezone. The array may be a dictionary array. +/// +/// format is an array of strings specifying the format to apply to the corresponding timestamp +/// value. The array may be a dictionary array. +pub(crate) fn timestamp_trunc_array_fmt_dyn( + array: &dyn Array, + formats: &dyn Array, +) -> Result { + match (array.data_type().clone(), formats.data_type().clone()) { + (DataType::Dictionary(_, _), DataType::Dictionary(_, _)) => { + downcast_dictionary_array!( + formats => { + downcast_dictionary_array!( + array => { + timestamp_trunc_array_fmt_dict_dict( + &array.downcast_dict::().unwrap(), + &formats.downcast_dict::().unwrap()) + .map(|a| Arc::new(a) as ArrayRef) + } + dt => return_compute_error_with!("timestamp_trunc does not support", dt) + ) + } + fmt => return_compute_error_with!("timestamp_trunc does not support format type", fmt), + ) + } + (DataType::Dictionary(_, _), DataType::Utf8) => { + downcast_dictionary_array!( + array => { + timestamp_trunc_array_fmt_dict_plain( + &array.downcast_dict::>().unwrap(), + formats.as_any().downcast_ref::() + .expect("Unexpected value type in formats")) + .map(|a| Arc::new(a) as ArrayRef) + } + dt => return_compute_error_with!("timestamp_trunc does not support", dt), + ) + } + (DataType::Timestamp(TimeUnit::Microsecond, _), DataType::Dictionary(_, _)) => { + downcast_dictionary_array!( + formats => { + downcast_temporal_array!(array => { + timestamp_trunc_array_fmt_plain_dict( + array, + &formats.downcast_dict::().unwrap()) + .map(|a| Arc::new(a) as ArrayRef) + } + dt => return_compute_error_with!("timestamp_trunc does not support", dt), + ) + } + fmt => return_compute_error_with!("timestamp_trunc does not support format type", fmt), + ) + } + (DataType::Timestamp(TimeUnit::Microsecond, _), DataType::Utf8) => { + downcast_temporal_array!( + array => { + timestamp_trunc_array_fmt_plain_plain(array, + formats.as_any().downcast_ref::().expect("Unexpected value type in formats")) + .map(|a| Arc::new(a) as ArrayRef) + }, + dt => return_compute_error_with!("timestamp_trunc does not support", dt), + ) + } + (dt, fmt) => Err(SparkError::Internal(format!( + "Unsupported datatype: {:}, format: {:?} for function 'timestamp_trunc'", + dt, fmt + ))), + } +} + +macro_rules! 
timestamp_trunc_array_fmt_helper { + ($array: ident, $formats: ident, $datatype: ident) => {{ + let mut builder = TimestampMicrosecondBuilder::with_capacity($array.len()); + let iter = $array.into_iter(); + assert_eq!( + $array.len(), + $formats.len(), + "lengths of values array and format array must be the same" + ); + match $datatype { + DataType::Timestamp(TimeUnit::Microsecond, Some(tz)) => { + let tz: Tz = tz.parse()?; + for (index, val) in iter.enumerate() { + let op_result = match $formats.value(index).to_uppercase().as_str() { + "YEAR" | "YYYY" | "YY" => { + as_timestamp_tz_with_op_single::(val, &mut builder, &tz, |dt| { + as_micros_from_unix_epoch_utc(trunc_date_to_year(dt)) + }) + } + "QUARTER" => { + as_timestamp_tz_with_op_single::(val, &mut builder, &tz, |dt| { + as_micros_from_unix_epoch_utc(trunc_date_to_quarter(dt)) + }) + } + "MONTH" | "MON" | "MM" => { + as_timestamp_tz_with_op_single::(val, &mut builder, &tz, |dt| { + as_micros_from_unix_epoch_utc(trunc_date_to_month(dt)) + }) + } + "WEEK" => { + as_timestamp_tz_with_op_single::(val, &mut builder, &tz, |dt| { + as_micros_from_unix_epoch_utc(trunc_date_to_week(dt)) + }) + } + "DAY" | "DD" => { + as_timestamp_tz_with_op_single::(val, &mut builder, &tz, |dt| { + as_micros_from_unix_epoch_utc(trunc_date_to_day(dt)) + }) + } + "HOUR" => { + as_timestamp_tz_with_op_single::(val, &mut builder, &tz, |dt| { + as_micros_from_unix_epoch_utc(trunc_date_to_hour(dt)) + }) + } + "MINUTE" => { + as_timestamp_tz_with_op_single::(val, &mut builder, &tz, |dt| { + as_micros_from_unix_epoch_utc(trunc_date_to_minute(dt)) + }) + } + "SECOND" => { + as_timestamp_tz_with_op_single::(val, &mut builder, &tz, |dt| { + as_micros_from_unix_epoch_utc(trunc_date_to_second(dt)) + }) + } + "MILLISECOND" => { + as_timestamp_tz_with_op_single::(val, &mut builder, &tz, |dt| { + as_micros_from_unix_epoch_utc(trunc_date_to_ms(dt)) + }) + } + "MICROSECOND" => { + as_timestamp_tz_with_op_single::(val, &mut builder, &tz, |dt| { + as_micros_from_unix_epoch_utc(trunc_date_to_microsec(dt)) + }) + } + _ => Err(SparkError::Internal(format!( + "Unsupported format: {:?} for function 'timestamp_trunc'", + $formats.value(index) + ))), + }; + op_result? 
+ } + Ok(builder.finish()) + } + dt => { + return_compute_error_with!( + "Unsupported input type '{:?}' for function 'timestamp_trunc'", + dt + ) + } + } + }}; +} + +fn timestamp_trunc_array_fmt_plain_plain( + array: &PrimitiveArray, + formats: &StringArray, +) -> Result +where + T: ArrowTemporalType + ArrowNumericType, + i64: From, +{ + let data_type = array.data_type(); + timestamp_trunc_array_fmt_helper!(array, formats, data_type) +} +fn timestamp_trunc_array_fmt_plain_dict( + array: &PrimitiveArray, + formats: &TypedDictionaryArray, +) -> Result +where + T: ArrowTemporalType + ArrowNumericType, + i64: From, + K: ArrowDictionaryKeyType, +{ + let data_type = array.data_type(); + timestamp_trunc_array_fmt_helper!(array, formats, data_type) +} + +fn timestamp_trunc_array_fmt_dict_plain( + array: &TypedDictionaryArray>, + formats: &StringArray, +) -> Result +where + T: ArrowTemporalType + ArrowNumericType, + i64: From, + K: ArrowDictionaryKeyType, +{ + let data_type = array.values().data_type(); + timestamp_trunc_array_fmt_helper!(array, formats, data_type) +} + +fn timestamp_trunc_array_fmt_dict_dict( + array: &TypedDictionaryArray>, + formats: &TypedDictionaryArray, +) -> Result +where + T: ArrowTemporalType + ArrowNumericType, + i64: From, + K: ArrowDictionaryKeyType, + F: ArrowDictionaryKeyType, +{ + let data_type = array.values().data_type(); + timestamp_trunc_array_fmt_helper!(array, formats, data_type) +} + +#[cfg(test)] +mod tests { + use crate::kernels::temporal::{ + date_trunc, date_trunc_array_fmt_dyn, timestamp_trunc, timestamp_trunc_array_fmt_dyn, + }; + use arrow_array::{ + builder::{PrimitiveDictionaryBuilder, StringDictionaryBuilder}, + iterator::ArrayIter, + types::{Date32Type, Int32Type, TimestampMicrosecondType}, + Array, Date32Array, PrimitiveArray, StringArray, TimestampMicrosecondArray, + }; + use std::sync::Arc; + + #[test] + #[cfg_attr(miri, ignore)] // test takes too long with miri + fn test_date_trunc() { + let size = 1000; + let mut vec: Vec = Vec::with_capacity(size); + for i in 0..size { + vec.push(i as i32); + } + let array = Date32Array::from(vec); + for fmt in [ + "YEAR", "YYYY", "YY", "QUARTER", "MONTH", "MON", "MM", "WEEK", + ] { + match date_trunc(&array, fmt.to_string()) { + Ok(a) => { + for i in 0..size { + assert!(array.values().get(i) >= a.values().get(i)) + } + } + _ => assert!(false), + } + } + } + + #[test] + // This test only verifies that the various input array types work. 
Actually correctness to + // ensure this produces the same results as spark is verified in the JVM tests + fn test_date_trunc_array_fmt_dyn() { + let size = 10; + let formats = [ + "YEAR", "YYYY", "YY", "QUARTER", "MONTH", "MON", "MM", "WEEK", + ]; + let mut vec: Vec = Vec::with_capacity(size * formats.len()); + let mut fmt_vec: Vec<&str> = Vec::with_capacity(size * formats.len()); + for i in 0..size { + for j in 0..formats.len() { + vec.push(i as i32 * 1_000_001); + fmt_vec.push(formats[j]); + } + } + + // timestamp array + let array = Date32Array::from(vec); + + // formats array + let fmt_array = StringArray::from(fmt_vec); + + // timestamp dictionary array + let mut date_dict_builder = PrimitiveDictionaryBuilder::::new(); + for v in array.iter() { + date_dict_builder + .append(v.unwrap()) + .expect("Error in building timestamp array"); + } + let mut array_dict = date_dict_builder.finish(); + // apply timezone + array_dict = array_dict.with_values(Arc::new( + array_dict + .values() + .as_any() + .downcast_ref::() + .unwrap() + .clone(), + )); + + // formats dictionary array + let mut formats_dict_builder = StringDictionaryBuilder::::new(); + for v in fmt_array.iter() { + formats_dict_builder + .append(v.unwrap()) + .expect("Error in building formats array"); + } + let fmt_dict = formats_dict_builder.finish(); + + // verify input arrays + let iter = ArrayIter::new(&array); + let mut dict_iter = array_dict + .downcast_dict::>() + .unwrap() + .into_iter(); + for val in iter { + assert_eq!( + dict_iter + .next() + .expect("array and dictionary array do not match"), + val + ) + } + + // verify input format arrays + let fmt_iter = ArrayIter::new(&fmt_array); + let mut fmt_dict_iter = fmt_dict.downcast_dict::().unwrap().into_iter(); + for val in fmt_iter { + assert_eq!( + fmt_dict_iter + .next() + .expect("formats and dictionary formats do not match"), + val + ) + } + + // test cases + if let Ok(a) = date_trunc_array_fmt_dyn(&array, &fmt_array) { + for i in 0..array.len() { + assert!( + array.value(i) >= a.as_any().downcast_ref::().unwrap().value(i) + ) + } + } else { + assert!(false) + } + if let Ok(a) = date_trunc_array_fmt_dyn(&array_dict, &fmt_array) { + for i in 0..array.len() { + assert!( + array.value(i) >= a.as_any().downcast_ref::().unwrap().value(i) + ) + } + } else { + assert!(false) + } + if let Ok(a) = date_trunc_array_fmt_dyn(&array, &fmt_dict) { + for i in 0..array.len() { + assert!( + array.value(i) >= a.as_any().downcast_ref::().unwrap().value(i) + ) + } + } else { + assert!(false) + } + if let Ok(a) = date_trunc_array_fmt_dyn(&array_dict, &fmt_dict) { + for i in 0..array.len() { + assert!( + array.value(i) >= a.as_any().downcast_ref::().unwrap().value(i) + ) + } + } else { + assert!(false) + } + } + + #[test] + #[cfg_attr(miri, ignore)] // test takes too long with miri + fn test_timestamp_trunc() { + let size = 1000; + let mut vec: Vec = Vec::with_capacity(size); + for i in 0..size { + vec.push(i as i64); + } + let array = TimestampMicrosecondArray::from(vec).with_timezone_utc(); + for fmt in [ + "YEAR", + "YYYY", + "YY", + "QUARTER", + "MONTH", + "MON", + "MM", + "WEEK", + "DAY", + "DD", + "HOUR", + "MINUTE", + "SECOND", + "MILLISECOND", + "MICROSECOND", + ] { + match timestamp_trunc(&array, fmt.to_string()) { + Ok(a) => { + for i in 0..size { + assert!(array.values().get(i) >= a.values().get(i)) + } + } + _ => assert!(false), + } + } + } + + #[test] + // test takes too long with miri + #[cfg_attr(miri, ignore)] + // This test only verifies that the various input array types 
work. Actually correctness to + // ensure this produces the same results as spark is verified in the JVM tests + fn test_timestamp_trunc_array_fmt_dyn() { + let size = 10; + let formats = [ + "YEAR", + "YYYY", + "YY", + "QUARTER", + "MONTH", + "MON", + "MM", + "WEEK", + "DAY", + "DD", + "HOUR", + "MINUTE", + "SECOND", + "MILLISECOND", + "MICROSECOND", + ]; + let mut vec: Vec = Vec::with_capacity(size * formats.len()); + let mut fmt_vec: Vec<&str> = Vec::with_capacity(size * formats.len()); + for i in 0..size { + for j in 0..formats.len() { + vec.push(i as i64 * 1_000_000_001); + fmt_vec.push(formats[j]); + } + } + + // timestamp array + let array = TimestampMicrosecondArray::from(vec).with_timezone_utc(); + + // formats array + let fmt_array = StringArray::from(fmt_vec); + + // timestamp dictionary array + let mut timestamp_dict_builder = + PrimitiveDictionaryBuilder::::new(); + for v in array.iter() { + timestamp_dict_builder + .append(v.unwrap()) + .expect("Error in building timestamp array"); + } + let mut array_dict = timestamp_dict_builder.finish(); + // apply timezone + array_dict = array_dict.with_values(Arc::new( + array_dict + .values() + .as_any() + .downcast_ref::() + .unwrap() + .clone() + .with_timezone_utc(), + )); + + // formats dictionary array + let mut formats_dict_builder = StringDictionaryBuilder::::new(); + for v in fmt_array.iter() { + formats_dict_builder + .append(v.unwrap()) + .expect("Error in building formats array"); + } + let fmt_dict = formats_dict_builder.finish(); + + // verify input arrays + let iter = ArrayIter::new(&array); + let mut dict_iter = array_dict + .downcast_dict::>() + .unwrap() + .into_iter(); + for val in iter { + assert_eq!( + dict_iter + .next() + .expect("array and dictionary array do not match"), + val + ) + } + + // verify input format arrays + let fmt_iter = ArrayIter::new(&fmt_array); + let mut fmt_dict_iter = fmt_dict.downcast_dict::().unwrap().into_iter(); + for val in fmt_iter { + assert_eq!( + fmt_dict_iter + .next() + .expect("formats and dictionary formats do not match"), + val + ) + } + + // test cases + if let Ok(a) = timestamp_trunc_array_fmt_dyn(&array, &fmt_array) { + for i in 0..array.len() { + assert!( + array.value(i) + >= a.as_any() + .downcast_ref::() + .unwrap() + .value(i) + ) + } + } else { + assert!(false) + } + if let Ok(a) = timestamp_trunc_array_fmt_dyn(&array_dict, &fmt_array) { + for i in 0..array.len() { + assert!( + array.value(i) + >= a.as_any() + .downcast_ref::() + .unwrap() + .value(i) + ) + } + } else { + assert!(false) + } + if let Ok(a) = timestamp_trunc_array_fmt_dyn(&array, &fmt_dict) { + for i in 0..array.len() { + assert!( + array.value(i) + >= a.as_any() + .downcast_ref::() + .unwrap() + .value(i) + ) + } + } else { + assert!(false) + } + if let Ok(a) = timestamp_trunc_array_fmt_dyn(&array_dict, &fmt_dict) { + for i in 0..array.len() { + assert!( + array.value(i) + >= a.as_any() + .downcast_ref::() + .unwrap() + .value(i) + ) + } + } else { + assert!(false) + } + } +} diff --git a/src/lib.rs b/src/lib.rs index 3c726f52a8e8..5168e0e80747 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -16,16 +16,20 @@ // under the License. 
mod abs; -pub mod cast; +mod cast; mod error; mod if_expr; +mod kernels; +mod temporal; pub mod timezone; pub mod utils; pub use abs::Abs; +pub use cast::Cast; pub use error::{SparkError, SparkResult}; pub use if_expr::IfExpr; +pub use temporal::{DateTruncExec, HourExec, MinuteExec, SecondExec, TimestampTruncExec}; /// Spark supports three evaluation modes when evaluating expressions, which affect /// the behavior when processing input values that are invalid or would result in an diff --git a/src/temporal.rs b/src/temporal.rs new file mode 100644 index 000000000000..ea30d3383dd5 --- /dev/null +++ b/src/temporal.rs @@ -0,0 +1,534 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::{ + any::Any, + fmt::{Debug, Display, Formatter}, + hash::{Hash, Hasher}, + sync::Arc, +}; + +use arrow::{ + compute::{date_part, DatePart}, + record_batch::RecordBatch, +}; +use arrow_schema::{DataType, Schema, TimeUnit::Microsecond}; +use datafusion::logical_expr::ColumnarValue; +use datafusion_common::{DataFusionError, ScalarValue::Utf8}; +use datafusion_physical_expr::PhysicalExpr; + +use crate::utils::{array_with_timezone, down_cast_any_ref}; + +use crate::kernels::temporal::{ + date_trunc_array_fmt_dyn, date_trunc_dyn, timestamp_trunc_array_fmt_dyn, timestamp_trunc_dyn, +}; + +#[derive(Debug, Hash)] +pub struct HourExec { + /// An array with DataType::Timestamp(TimeUnit::Microsecond, None) + child: Arc, + timezone: String, +} + +impl HourExec { + pub fn new(child: Arc, timezone: String) -> Self { + HourExec { child, timezone } + } +} + +impl Display for HourExec { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!( + f, + "Hour [timezone:{}, child: {}]", + self.timezone, self.child + ) + } +} + +impl PartialEq for HourExec { + fn eq(&self, other: &dyn Any) -> bool { + down_cast_any_ref(other) + .downcast_ref::() + .map(|x| self.child.eq(&x.child) && self.timezone.eq(&x.timezone)) + .unwrap_or(false) + } +} + +impl PhysicalExpr for HourExec { + fn as_any(&self) -> &dyn Any { + self + } + + fn data_type(&self, input_schema: &Schema) -> datafusion_common::Result { + match self.child.data_type(input_schema).unwrap() { + DataType::Dictionary(key_type, _) => { + Ok(DataType::Dictionary(key_type, Box::new(DataType::Int32))) + } + _ => Ok(DataType::Int32), + } + } + + fn nullable(&self, _: &Schema) -> datafusion_common::Result { + Ok(true) + } + + fn evaluate(&self, batch: &RecordBatch) -> datafusion_common::Result { + let arg = self.child.evaluate(batch)?; + match arg { + ColumnarValue::Array(array) => { + let array = array_with_timezone( + array, + self.timezone.clone(), + Some(&DataType::Timestamp( + Microsecond, + Some(self.timezone.clone().into()), + )), + )?; + let result = date_part(&array, DatePart::Hour)?; + + 
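// In other words, the hour is extracted only after array_with_timezone has applied the
// session timezone to the UTC-backed microsecond values, so the Int32 result returned
// below reflects local time rather than UTC.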
Ok(ColumnarValue::Array(result)) + } + _ => Err(DataFusionError::Execution( + "Hour(scalar) should be fold in Spark JVM side.".to_string(), + )), + } + } + + fn children(&self) -> Vec<&Arc> { + vec![&self.child] + } + + fn with_new_children( + self: Arc, + children: Vec>, + ) -> Result, DataFusionError> { + Ok(Arc::new(HourExec::new( + children[0].clone(), + self.timezone.clone(), + ))) + } + + fn dyn_hash(&self, state: &mut dyn Hasher) { + let mut s = state; + self.child.hash(&mut s); + self.timezone.hash(&mut s); + self.hash(&mut s); + } +} + +#[derive(Debug, Hash)] +pub struct MinuteExec { + /// An array with DataType::Timestamp(TimeUnit::Microsecond, None) + child: Arc, + timezone: String, +} + +impl MinuteExec { + pub fn new(child: Arc, timezone: String) -> Self { + MinuteExec { child, timezone } + } +} + +impl Display for MinuteExec { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!( + f, + "Minute [timezone:{}, child: {}]", + self.timezone, self.child + ) + } +} + +impl PartialEq for MinuteExec { + fn eq(&self, other: &dyn Any) -> bool { + down_cast_any_ref(other) + .downcast_ref::() + .map(|x| self.child.eq(&x.child) && self.timezone.eq(&x.timezone)) + .unwrap_or(false) + } +} + +impl PhysicalExpr for MinuteExec { + fn as_any(&self) -> &dyn Any { + self + } + + fn data_type(&self, input_schema: &Schema) -> datafusion_common::Result { + match self.child.data_type(input_schema).unwrap() { + DataType::Dictionary(key_type, _) => { + Ok(DataType::Dictionary(key_type, Box::new(DataType::Int32))) + } + _ => Ok(DataType::Int32), + } + } + + fn nullable(&self, _: &Schema) -> datafusion_common::Result { + Ok(true) + } + + fn evaluate(&self, batch: &RecordBatch) -> datafusion_common::Result { + let arg = self.child.evaluate(batch)?; + match arg { + ColumnarValue::Array(array) => { + let array = array_with_timezone( + array, + self.timezone.clone(), + Some(&DataType::Timestamp( + Microsecond, + Some(self.timezone.clone().into()), + )), + )?; + let result = date_part(&array, DatePart::Minute)?; + + Ok(ColumnarValue::Array(result)) + } + _ => Err(DataFusionError::Execution( + "Minute(scalar) should be fold in Spark JVM side.".to_string(), + )), + } + } + + fn children(&self) -> Vec<&Arc> { + vec![&self.child] + } + + fn with_new_children( + self: Arc, + children: Vec>, + ) -> Result, DataFusionError> { + Ok(Arc::new(MinuteExec::new( + children[0].clone(), + self.timezone.clone(), + ))) + } + + fn dyn_hash(&self, state: &mut dyn Hasher) { + let mut s = state; + self.child.hash(&mut s); + self.timezone.hash(&mut s); + self.hash(&mut s); + } +} + +#[derive(Debug, Hash)] +pub struct SecondExec { + /// An array with DataType::Timestamp(TimeUnit::Microsecond, None) + child: Arc, + timezone: String, +} + +impl SecondExec { + pub fn new(child: Arc, timezone: String) -> Self { + SecondExec { child, timezone } + } +} + +impl Display for SecondExec { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!( + f, + "Second (timezone:{}, child: {}]", + self.timezone, self.child + ) + } +} + +impl PartialEq for SecondExec { + fn eq(&self, other: &dyn Any) -> bool { + down_cast_any_ref(other) + .downcast_ref::() + .map(|x| self.child.eq(&x.child) && self.timezone.eq(&x.timezone)) + .unwrap_or(false) + } +} + +impl PhysicalExpr for SecondExec { + fn as_any(&self) -> &dyn Any { + self + } + + fn data_type(&self, input_schema: &Schema) -> datafusion_common::Result { + match self.child.data_type(input_schema).unwrap() { + DataType::Dictionary(key_type, _) => { + 
Ok(DataType::Dictionary(key_type, Box::new(DataType::Int32))) + } + _ => Ok(DataType::Int32), + } + } + + fn nullable(&self, _: &Schema) -> datafusion_common::Result { + Ok(true) + } + + fn evaluate(&self, batch: &RecordBatch) -> datafusion_common::Result { + let arg = self.child.evaluate(batch)?; + match arg { + ColumnarValue::Array(array) => { + let array = array_with_timezone( + array, + self.timezone.clone(), + Some(&DataType::Timestamp( + Microsecond, + Some(self.timezone.clone().into()), + )), + )?; + let result = date_part(&array, DatePart::Second)?; + + Ok(ColumnarValue::Array(result)) + } + _ => Err(DataFusionError::Execution( + "Second(scalar) should be fold in Spark JVM side.".to_string(), + )), + } + } + + fn children(&self) -> Vec<&Arc> { + vec![&self.child] + } + + fn with_new_children( + self: Arc, + children: Vec>, + ) -> Result, DataFusionError> { + Ok(Arc::new(SecondExec::new( + children[0].clone(), + self.timezone.clone(), + ))) + } + + fn dyn_hash(&self, state: &mut dyn Hasher) { + let mut s = state; + self.child.hash(&mut s); + self.timezone.hash(&mut s); + self.hash(&mut s); + } +} + +#[derive(Debug, Hash)] +pub struct DateTruncExec { + /// An array with DataType::Date32 + child: Arc, + /// Scalar UTF8 string matching the valid values in Spark SQL: https://spark.apache.org/docs/latest/api/sql/index.html#trunc + format: Arc, +} + +impl DateTruncExec { + pub fn new(child: Arc, format: Arc) -> Self { + DateTruncExec { child, format } + } +} + +impl Display for DateTruncExec { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!( + f, + "DateTrunc [child:{}, format: {}]", + self.child, self.format + ) + } +} + +impl PartialEq for DateTruncExec { + fn eq(&self, other: &dyn Any) -> bool { + down_cast_any_ref(other) + .downcast_ref::() + .map(|x| self.child.eq(&x.child) && self.format.eq(&x.format)) + .unwrap_or(false) + } +} + +impl PhysicalExpr for DateTruncExec { + fn as_any(&self) -> &dyn Any { + self + } + + fn data_type(&self, input_schema: &Schema) -> datafusion_common::Result { + self.child.data_type(input_schema) + } + + fn nullable(&self, _: &Schema) -> datafusion_common::Result { + Ok(true) + } + + fn evaluate(&self, batch: &RecordBatch) -> datafusion_common::Result { + let date = self.child.evaluate(batch)?; + let format = self.format.evaluate(batch)?; + match (date, format) { + (ColumnarValue::Array(date), ColumnarValue::Scalar(Utf8(Some(format)))) => { + let result = date_trunc_dyn(&date, format)?; + Ok(ColumnarValue::Array(result)) + } + (ColumnarValue::Array(date), ColumnarValue::Array(formats)) => { + let result = date_trunc_array_fmt_dyn(&date, &formats)?; + Ok(ColumnarValue::Array(result)) + } + _ => Err(DataFusionError::Execution( + "Invalid input to function DateTrunc. 
Expected (PrimitiveArray, Scalar) or \ + (PrimitiveArray, StringArray)".to_string(), + )), + } + } + + fn children(&self) -> Vec<&Arc> { + vec![&self.child] + } + + fn with_new_children( + self: Arc, + children: Vec>, + ) -> Result, DataFusionError> { + Ok(Arc::new(DateTruncExec::new( + children[0].clone(), + self.format.clone(), + ))) + } + + fn dyn_hash(&self, state: &mut dyn Hasher) { + let mut s = state; + self.child.hash(&mut s); + self.format.hash(&mut s); + self.hash(&mut s); + } +} + +#[derive(Debug, Hash)] +pub struct TimestampTruncExec { + /// An array with DataType::Timestamp(TimeUnit::Microsecond, None) + child: Arc, + /// Scalar UTF8 string matching the valid values in Spark SQL: https://spark.apache.org/docs/latest/api/sql/index.html#date_trunc + format: Arc, + /// String containing a timezone name. The name must be found in the standard timezone + /// database (https://en.wikipedia.org/wiki/List_of_tz_database_time_zones). The string is + /// later parsed into a chrono::TimeZone. + /// Timestamp arrays in this implementation are kept in arrays of UTC timestamps (in micros) + /// along with a single value for the associated TimeZone. The timezone offset is applied + /// just before any operations on the timestamp + timezone: String, +} + +impl TimestampTruncExec { + pub fn new( + child: Arc, + format: Arc, + timezone: String, + ) -> Self { + TimestampTruncExec { + child, + format, + timezone, + } + } +} + +impl Display for TimestampTruncExec { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!( + f, + "TimestampTrunc [child:{}, format:{}, timezone: {}]", + self.child, self.format, self.timezone + ) + } +} + +impl PartialEq for TimestampTruncExec { + fn eq(&self, other: &dyn Any) -> bool { + down_cast_any_ref(other) + .downcast_ref::() + .map(|x| { + self.child.eq(&x.child) + && self.format.eq(&x.format) + && self.timezone.eq(&x.timezone) + }) + .unwrap_or(false) + } +} + +impl PhysicalExpr for TimestampTruncExec { + fn as_any(&self) -> &dyn Any { + self + } + + fn data_type(&self, input_schema: &Schema) -> datafusion_common::Result { + match self.child.data_type(input_schema)? { + DataType::Dictionary(key_type, _) => Ok(DataType::Dictionary( + key_type, + Box::new(DataType::Timestamp(Microsecond, None)), + )), + _ => Ok(DataType::Timestamp(Microsecond, None)), + } + } + + fn nullable(&self, _: &Schema) -> datafusion_common::Result { + Ok(true) + } + + fn evaluate(&self, batch: &RecordBatch) -> datafusion_common::Result { + let timestamp = self.child.evaluate(batch)?; + let format = self.format.evaluate(batch)?; + let tz = self.timezone.clone(); + match (timestamp, format) { + (ColumnarValue::Array(ts), ColumnarValue::Scalar(Utf8(Some(format)))) => { + let ts = array_with_timezone( + ts, + tz.clone(), + Some(&DataType::Timestamp(Microsecond, Some(tz.into()))), + )?; + let result = timestamp_trunc_dyn(&ts, format)?; + Ok(ColumnarValue::Array(result)) + } + (ColumnarValue::Array(ts), ColumnarValue::Array(formats)) => { + let ts = array_with_timezone( + ts, + tz.clone(), + Some(&DataType::Timestamp(Microsecond, Some(tz.into()))), + )?; + let result = timestamp_trunc_array_fmt_dyn(&ts, &formats)?; + Ok(ColumnarValue::Array(result)) + } + _ => Err(DataFusionError::Execution( + "Invalid input to function TimestampTrunc. 
\ + Expected (PrimitiveArray, Scalar, String) or \ + (PrimitiveArray, StringArray, String)" + .to_string(), + )), + } + } + + fn children(&self) -> Vec<&Arc> { + vec![&self.child] + } + + fn with_new_children( + self: Arc, + children: Vec>, + ) -> Result, DataFusionError> { + Ok(Arc::new(TimestampTruncExec::new( + children[0].clone(), + self.format.clone(), + self.timezone.clone(), + ))) + } + + fn dyn_hash(&self, state: &mut dyn Hasher) { + let mut s = state; + self.child.hash(&mut s); + self.format.hash(&mut s); + self.timezone.hash(&mut s); + self.hash(&mut s); + } +} From 46e8bf287a93a977181da0c3499a582231afffa3 Mon Sep 17 00:00:00 2001 From: Vipul Vaibhaw Date: Tue, 16 Jul 2024 01:03:10 +0530 Subject: [PATCH 07/68] fix: Optimize some functions to rewrite dictionary-encoded strings (#627) * dedup code * transforming the dict directly * code optimization for cast string to timestamp * minor optimizations * fmt fixes and casting to dict array without unpacking to array first * bug fixes * revert unrelated change * Added test case and code refactor * minor optimization * minor optimization again * convert the cast to array * Revert "convert the cast to array" This reverts commit 9270aedeafa12dacabc664ca9df7c85236e05d85. * bug fixes * rename the test to cast_dict_to_timestamp arr --- src/cast.rs | 98 ++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 71 insertions(+), 27 deletions(-) diff --git a/src/cast.rs b/src/cast.rs index 7f53583e8d76..8702ce7070a8 100644 --- a/src/cast.rs +++ b/src/cast.rs @@ -31,7 +31,7 @@ use arrow::{ GenericStringArray, Int16Array, Int32Array, Int64Array, Int8Array, OffsetSizeTrait, PrimitiveArray, }, - compute::{cast_with_options, unary, CastOptions}, + compute::{cast_with_options, take, unary, CastOptions}, datatypes::{ ArrowPrimitiveType, Decimal128Type, DecimalType, Float32Type, Float64Type, Int64Type, TimestampMicrosecondType, @@ -40,6 +40,7 @@ use arrow::{ record_batch::RecordBatch, util::display::FormatOptions, }; +use arrow_array::DictionaryArray; use arrow_schema::{DataType, Schema}; use datafusion_common::{ @@ -98,7 +99,6 @@ macro_rules! cast_utf8_to_int { result }}; } - macro_rules! cast_utf8_to_timestamp { ($array:expr, $eval_mode:expr, $array_type:ty, $cast_method:ident) => {{ let len = $array.len(); @@ -507,19 +507,27 @@ impl Cast { let to_type = &self.data_type; let array = array_with_timezone(array, self.timezone.clone(), Some(to_type))?; let from_type = array.data_type().clone(); - - // unpack dictionary string arrays first - // TODO: we are unpacking a dictionary-encoded array and then performing - // the cast. We could potentially improve performance here by casting the - // dictionary values directly without unpacking the array first, although this - // would add more complexity to the code let array = match &from_type { DataType::Dictionary(key_type, value_type) if key_type.as_ref() == &DataType::Int32 && (value_type.as_ref() == &DataType::Utf8 || value_type.as_ref() == &DataType::LargeUtf8) => { - cast_with_options(&array, value_type.as_ref(), &CAST_OPTIONS)? 
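// The replacement below avoids unpacking the dictionary before casting: it casts the
// dictionary's values array once, rebuilds a DictionaryArray with the original keys, and
// only flattens the result with `take` when the target type is not itself a dictionary.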
+ let dict_array = array + .as_any() + .downcast_ref::>() + .expect("Expected a dictionary array"); + + let casted_dictionary = DictionaryArray::::new( + dict_array.keys().clone(), + self.cast_array(dict_array.values().clone())?, + ); + + let casted_result = match to_type { + DataType::Dictionary(_, _) => Arc::new(casted_dictionary.clone()), + _ => take(casted_dictionary.values().as_ref(), dict_array.keys(), None)?, + }; + return Ok(spark_cast(casted_result, &from_type, to_type)); } _ => array, }; @@ -724,26 +732,31 @@ impl Cast { .downcast_ref::>() .expect("Expected a string array"); - let cast_array: ArrayRef = match to_type { - DataType::Date32 => { - let len = string_array.len(); - let mut cast_array = PrimitiveArray::::builder(len); - for i in 0..len { - if !string_array.is_null(i) { - match date_parser(string_array.value(i), eval_mode) { - Ok(Some(cast_value)) => cast_array.append_value(cast_value), - Ok(None) => cast_array.append_null(), - Err(e) => return Err(e), - } - } else { - cast_array.append_null() - } + if to_type != &DataType::Date32 { + unreachable!("Invalid data type {:?} in cast from string", to_type); + } + + let len = string_array.len(); + let mut cast_array = PrimitiveArray::::builder(len); + + for i in 0..len { + let value = if string_array.is_null(i) { + None + } else { + match date_parser(string_array.value(i), eval_mode) { + Ok(Some(cast_value)) => Some(cast_value), + Ok(None) => None, + Err(e) => return Err(e), } - Arc::new(cast_array.finish()) as ArrayRef + }; + + match value { + Some(cast_value) => cast_array.append_value(cast_value), + None => cast_array.append_null(), } - _ => unreachable!("Invalid data type {:?} in cast from string", to_type), - }; - Ok(cast_array) + } + + Ok(Arc::new(cast_array.finish()) as ArrayRef) } fn cast_string_to_timestamp( @@ -1796,6 +1809,37 @@ mod tests { assert_eq!(result.len(), 2); } + #[test] + fn test_cast_dict_string_to_timestamp() -> DataFusionResult<()> { + // prepare input data + let keys = Int32Array::from(vec![0, 1]); + let values: ArrayRef = Arc::new(StringArray::from(vec![ + Some("2020-01-01T12:34:56.123456"), + Some("T2"), + ])); + let dict_array = Arc::new(DictionaryArray::new(keys, values)); + + // prepare cast expression + let timezone = "UTC".to_string(); + let expr = Arc::new(Column::new("a", 0)); // this is not used by the test + let cast = Cast::new( + expr, + DataType::Timestamp(TimeUnit::Microsecond, Some(timezone.clone().into())), + EvalMode::Legacy, + timezone.clone(), + ); + + // test casting string dictionary array to timestamp array + let result = cast.cast_array(dict_array)?; + assert_eq!( + *result.data_type(), + DataType::Timestamp(TimeUnit::Microsecond, Some(timezone.into())) + ); + assert_eq!(result.len(), 2); + + Ok(()) + } + #[test] fn date_parser_test() { for date in &[ From 21793315c46434e9c60967de0a4ea7f9a29c30be Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 16 Jul 2024 13:17:46 -0600 Subject: [PATCH 08/68] Change suffix on some expressions from Exec to Expr (#673) --- src/lib.rs | 2 +- src/temporal.rs | 70 ++++++++++++++++++++++++------------------------- 2 files changed, 36 insertions(+), 36 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 5168e0e80747..91d61f70a14d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -29,7 +29,7 @@ pub use abs::Abs; pub use cast::Cast; pub use error::{SparkError, SparkResult}; pub use if_expr::IfExpr; -pub use temporal::{DateTruncExec, HourExec, MinuteExec, SecondExec, TimestampTruncExec}; +pub use temporal::{DateTruncExpr, HourExpr, MinuteExpr, 
SecondExpr, TimestampTruncExpr}; /// Spark supports three evaluation modes when evaluating expressions, which affect /// the behavior when processing input values that are invalid or would result in an diff --git a/src/temporal.rs b/src/temporal.rs index ea30d3383dd5..34b71a284a4e 100644 --- a/src/temporal.rs +++ b/src/temporal.rs @@ -38,19 +38,19 @@ use crate::kernels::temporal::{ }; #[derive(Debug, Hash)] -pub struct HourExec { +pub struct HourExpr { /// An array with DataType::Timestamp(TimeUnit::Microsecond, None) child: Arc, timezone: String, } -impl HourExec { +impl HourExpr { pub fn new(child: Arc, timezone: String) -> Self { - HourExec { child, timezone } + HourExpr { child, timezone } } } -impl Display for HourExec { +impl Display for HourExpr { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { write!( f, @@ -60,7 +60,7 @@ impl Display for HourExec { } } -impl PartialEq for HourExec { +impl PartialEq for HourExpr { fn eq(&self, other: &dyn Any) -> bool { down_cast_any_ref(other) .downcast_ref::() @@ -69,7 +69,7 @@ impl PartialEq for HourExec { } } -impl PhysicalExpr for HourExec { +impl PhysicalExpr for HourExpr { fn as_any(&self) -> &dyn Any { self } @@ -117,7 +117,7 @@ impl PhysicalExpr for HourExec { self: Arc, children: Vec>, ) -> Result, DataFusionError> { - Ok(Arc::new(HourExec::new( + Ok(Arc::new(HourExpr::new( children[0].clone(), self.timezone.clone(), ))) @@ -132,19 +132,19 @@ impl PhysicalExpr for HourExec { } #[derive(Debug, Hash)] -pub struct MinuteExec { +pub struct MinuteExpr { /// An array with DataType::Timestamp(TimeUnit::Microsecond, None) child: Arc, timezone: String, } -impl MinuteExec { +impl MinuteExpr { pub fn new(child: Arc, timezone: String) -> Self { - MinuteExec { child, timezone } + MinuteExpr { child, timezone } } } -impl Display for MinuteExec { +impl Display for MinuteExpr { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { write!( f, @@ -154,7 +154,7 @@ impl Display for MinuteExec { } } -impl PartialEq for MinuteExec { +impl PartialEq for MinuteExpr { fn eq(&self, other: &dyn Any) -> bool { down_cast_any_ref(other) .downcast_ref::() @@ -163,7 +163,7 @@ impl PartialEq for MinuteExec { } } -impl PhysicalExpr for MinuteExec { +impl PhysicalExpr for MinuteExpr { fn as_any(&self) -> &dyn Any { self } @@ -211,7 +211,7 @@ impl PhysicalExpr for MinuteExec { self: Arc, children: Vec>, ) -> Result, DataFusionError> { - Ok(Arc::new(MinuteExec::new( + Ok(Arc::new(MinuteExpr::new( children[0].clone(), self.timezone.clone(), ))) @@ -226,19 +226,19 @@ impl PhysicalExpr for MinuteExec { } #[derive(Debug, Hash)] -pub struct SecondExec { +pub struct SecondExpr { /// An array with DataType::Timestamp(TimeUnit::Microsecond, None) child: Arc, timezone: String, } -impl SecondExec { +impl SecondExpr { pub fn new(child: Arc, timezone: String) -> Self { - SecondExec { child, timezone } + SecondExpr { child, timezone } } } -impl Display for SecondExec { +impl Display for SecondExpr { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { write!( f, @@ -248,7 +248,7 @@ impl Display for SecondExec { } } -impl PartialEq for SecondExec { +impl PartialEq for SecondExpr { fn eq(&self, other: &dyn Any) -> bool { down_cast_any_ref(other) .downcast_ref::() @@ -257,7 +257,7 @@ impl PartialEq for SecondExec { } } -impl PhysicalExpr for SecondExec { +impl PhysicalExpr for SecondExpr { fn as_any(&self) -> &dyn Any { self } @@ -305,7 +305,7 @@ impl PhysicalExpr for SecondExec { self: Arc, children: Vec>, ) -> Result, DataFusionError> { - Ok(Arc::new(SecondExec::new( 
+ Ok(Arc::new(SecondExpr::new( children[0].clone(), self.timezone.clone(), ))) @@ -320,20 +320,20 @@ impl PhysicalExpr for SecondExec { } #[derive(Debug, Hash)] -pub struct DateTruncExec { +pub struct DateTruncExpr { /// An array with DataType::Date32 child: Arc, /// Scalar UTF8 string matching the valid values in Spark SQL: https://spark.apache.org/docs/latest/api/sql/index.html#trunc format: Arc, } -impl DateTruncExec { +impl DateTruncExpr { pub fn new(child: Arc, format: Arc) -> Self { - DateTruncExec { child, format } + DateTruncExpr { child, format } } } -impl Display for DateTruncExec { +impl Display for DateTruncExpr { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { write!( f, @@ -343,7 +343,7 @@ impl Display for DateTruncExec { } } -impl PartialEq for DateTruncExec { +impl PartialEq for DateTruncExpr { fn eq(&self, other: &dyn Any) -> bool { down_cast_any_ref(other) .downcast_ref::() @@ -352,7 +352,7 @@ impl PartialEq for DateTruncExec { } } -impl PhysicalExpr for DateTruncExec { +impl PhysicalExpr for DateTruncExpr { fn as_any(&self) -> &dyn Any { self } @@ -392,7 +392,7 @@ impl PhysicalExpr for DateTruncExec { self: Arc, children: Vec>, ) -> Result, DataFusionError> { - Ok(Arc::new(DateTruncExec::new( + Ok(Arc::new(DateTruncExpr::new( children[0].clone(), self.format.clone(), ))) @@ -407,7 +407,7 @@ impl PhysicalExpr for DateTruncExec { } #[derive(Debug, Hash)] -pub struct TimestampTruncExec { +pub struct TimestampTruncExpr { /// An array with DataType::Timestamp(TimeUnit::Microsecond, None) child: Arc, /// Scalar UTF8 string matching the valid values in Spark SQL: https://spark.apache.org/docs/latest/api/sql/index.html#date_trunc @@ -421,13 +421,13 @@ pub struct TimestampTruncExec { timezone: String, } -impl TimestampTruncExec { +impl TimestampTruncExpr { pub fn new( child: Arc, format: Arc, timezone: String, ) -> Self { - TimestampTruncExec { + TimestampTruncExpr { child, format, timezone, @@ -435,7 +435,7 @@ impl TimestampTruncExec { } } -impl Display for TimestampTruncExec { +impl Display for TimestampTruncExpr { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { write!( f, @@ -445,7 +445,7 @@ impl Display for TimestampTruncExec { } } -impl PartialEq for TimestampTruncExec { +impl PartialEq for TimestampTruncExpr { fn eq(&self, other: &dyn Any) -> bool { down_cast_any_ref(other) .downcast_ref::() @@ -458,7 +458,7 @@ impl PartialEq for TimestampTruncExec { } } -impl PhysicalExpr for TimestampTruncExec { +impl PhysicalExpr for TimestampTruncExpr { fn as_any(&self) -> &dyn Any { self } @@ -517,7 +517,7 @@ impl PhysicalExpr for TimestampTruncExec { self: Arc, children: Vec>, ) -> Result, DataFusionError> { - Ok(Arc::new(TimestampTruncExec::new( + Ok(Arc::new(TimestampTruncExpr::new( children[0].clone(), self.format.clone(), self.timezone.clone(), From 01e21a931947326d9eb620e76c91068b4fdf1495 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 20 Jul 2024 14:08:00 -0600 Subject: [PATCH 09/68] chore: Disable abs and signum because they return incorrect results (#695) --- Cargo.toml | 1 - src/abs.rs | 89 ------------------------------------------------------ src/lib.rs | 2 -- 3 files changed, 92 deletions(-) delete mode 100644 src/abs.rs diff --git a/Cargo.toml b/Cargo.toml index 976a1f36f354..192ed102b7f6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -33,7 +33,6 @@ arrow-schema = { workspace = true } chrono = { workspace = true } datafusion = { workspace = true } datafusion-common = { workspace = true } -datafusion-functions = { workspace = true } datafusion-expr 
= { workspace = true } datafusion-physical-expr = { workspace = true } datafusion-physical-plan = { workspace = true } diff --git a/src/abs.rs b/src/abs.rs deleted file mode 100644 index fa25a7775ae7..000000000000 --- a/src/abs.rs +++ /dev/null @@ -1,89 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Spark-compatible implementation of abs function - -use std::{any::Any, sync::Arc}; - -use arrow::datatypes::DataType; -use arrow_schema::ArrowError; - -use datafusion::logical_expr::{ColumnarValue, ScalarUDFImpl, Signature}; -use datafusion_common::DataFusionError; -use datafusion_functions::math; - -use super::{EvalMode, SparkError}; - -/// Spark-compatible ABS expression -#[derive(Debug)] -pub struct Abs { - inner_abs_func: Arc, - eval_mode: EvalMode, - data_type_name: String, -} - -impl Abs { - pub fn new(eval_mode: EvalMode, data_type_name: String) -> Result { - if let EvalMode::Legacy | EvalMode::Ansi = eval_mode { - Ok(Self { - inner_abs_func: math::abs().inner().clone(), - eval_mode, - data_type_name, - }) - } else { - Err(DataFusionError::Execution(format!( - "Invalid EvalMode: \"{:?}\"", - eval_mode - ))) - } - } -} - -impl ScalarUDFImpl for Abs { - fn as_any(&self) -> &dyn Any { - self - } - fn name(&self) -> &str { - "abs" - } - - fn signature(&self) -> &Signature { - self.inner_abs_func.signature() - } - - fn return_type(&self, arg_types: &[DataType]) -> Result { - self.inner_abs_func.return_type(arg_types) - } - - fn invoke(&self, args: &[ColumnarValue]) -> Result { - match self.inner_abs_func.invoke(args) { - Err(DataFusionError::ArrowError(ArrowError::ComputeError(msg), _)) - if msg.contains("overflow") => - { - if self.eval_mode == EvalMode::Legacy { - Ok(args[0].clone()) - } else { - Err(SparkError::ArithmeticOverflow { - from_type: self.data_type_name.clone(), - } - .into()) - } - } - other => other, - } - } -} diff --git a/src/lib.rs b/src/lib.rs index 91d61f70a14d..336201f4846e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -15,7 +15,6 @@ // specific language governing permissions and limitations // under the License. 
-mod abs; mod cast; mod error; mod if_expr; @@ -25,7 +24,6 @@ mod temporal; pub mod timezone; pub mod utils; -pub use abs::Abs; pub use cast::Cast; pub use error::{SparkError, SparkResult}; pub use if_expr::IfExpr; From 01362b50b22e2629851e6ad363ecaef4e32b33ab Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Mon, 22 Jul 2024 02:51:08 +0800 Subject: [PATCH 10/68] chore: Make rust clippy happy (#701) * chore: Make rust clippy happy Signed-off-by: Xuanwo * Format code Signed-off-by: Xuanwo --------- Signed-off-by: Xuanwo --- src/cast.rs | 10 +++++----- src/kernels/temporal.rs | 28 ++++++++++++++-------------- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/src/cast.rs b/src/cast.rs index 8702ce7070a8..9a47cc87334e 100644 --- a/src/cast.rs +++ b/src/cast.rs @@ -1854,7 +1854,7 @@ mod tests { "2020-01-01T", ] { for eval_mode in &[EvalMode::Legacy, EvalMode::Ansi, EvalMode::Try] { - assert_eq!(date_parser(*date, *eval_mode).unwrap(), Some(18262)); + assert_eq!(date_parser(date, *eval_mode).unwrap(), Some(18262)); } } @@ -1875,14 +1875,14 @@ mod tests { "--262143-12-31 ", ] { for eval_mode in &[EvalMode::Legacy, EvalMode::Try] { - assert_eq!(date_parser(*date, *eval_mode).unwrap(), None); + assert_eq!(date_parser(date, *eval_mode).unwrap(), None); } - assert!(date_parser(*date, EvalMode::Ansi).is_err()); + assert!(date_parser(date, EvalMode::Ansi).is_err()); } for date in &["-3638-5"] { for eval_mode in &[EvalMode::Legacy, EvalMode::Try, EvalMode::Ansi] { - assert_eq!(date_parser(*date, *eval_mode).unwrap(), Some(-2048160)); + assert_eq!(date_parser(date, *eval_mode).unwrap(), Some(-2048160)); } } @@ -1898,7 +1898,7 @@ mod tests { "-0973250", ] { for eval_mode in &[EvalMode::Legacy, EvalMode::Try, EvalMode::Ansi] { - assert_eq!(date_parser(*date, *eval_mode).unwrap(), None); + assert_eq!(date_parser(date, *eval_mode).unwrap(), None); } } } diff --git a/src/kernels/temporal.rs b/src/kernels/temporal.rs index 6f2474e8d7a8..cda4bef5d184 100644 --- a/src/kernels/temporal.rs +++ b/src/kernels/temporal.rs @@ -838,7 +838,7 @@ mod tests { assert!(array.values().get(i) >= a.values().get(i)) } } - _ => assert!(false), + _ => unreachable!(), } } } @@ -854,9 +854,9 @@ mod tests { let mut vec: Vec = Vec::with_capacity(size * formats.len()); let mut fmt_vec: Vec<&str> = Vec::with_capacity(size * formats.len()); for i in 0..size { - for j in 0..formats.len() { + for fmt_value in &formats { vec.push(i as i32 * 1_000_001); - fmt_vec.push(formats[j]); + fmt_vec.push(fmt_value); } } @@ -928,7 +928,7 @@ mod tests { ) } } else { - assert!(false) + unreachable!() } if let Ok(a) = date_trunc_array_fmt_dyn(&array_dict, &fmt_array) { for i in 0..array.len() { @@ -937,7 +937,7 @@ mod tests { ) } } else { - assert!(false) + unreachable!() } if let Ok(a) = date_trunc_array_fmt_dyn(&array, &fmt_dict) { for i in 0..array.len() { @@ -946,7 +946,7 @@ mod tests { ) } } else { - assert!(false) + unreachable!() } if let Ok(a) = date_trunc_array_fmt_dyn(&array_dict, &fmt_dict) { for i in 0..array.len() { @@ -955,7 +955,7 @@ mod tests { ) } } else { - assert!(false) + unreachable!() } } @@ -991,7 +991,7 @@ mod tests { assert!(array.values().get(i) >= a.values().get(i)) } } - _ => assert!(false), + _ => unreachable!(), } } } @@ -1023,9 +1023,9 @@ mod tests { let mut vec: Vec = Vec::with_capacity(size * formats.len()); let mut fmt_vec: Vec<&str> = Vec::with_capacity(size * formats.len()); for i in 0..size { - for j in 0..formats.len() { + for fmt_value in &formats { vec.push(i as i64 * 1_000_000_001); - 
fmt_vec.push(formats[j]); + fmt_vec.push(fmt_value); } } @@ -1103,7 +1103,7 @@ mod tests { ) } } else { - assert!(false) + unreachable!() } if let Ok(a) = timestamp_trunc_array_fmt_dyn(&array_dict, &fmt_array) { for i in 0..array.len() { @@ -1116,7 +1116,7 @@ mod tests { ) } } else { - assert!(false) + unreachable!() } if let Ok(a) = timestamp_trunc_array_fmt_dyn(&array, &fmt_dict) { for i in 0..array.len() { @@ -1129,7 +1129,7 @@ mod tests { ) } } else { - assert!(false) + unreachable!() } if let Ok(a) = timestamp_trunc_array_fmt_dyn(&array_dict, &fmt_dict) { for i in 0..array.len() { @@ -1142,7 +1142,7 @@ mod tests { ) } } else { - assert!(false) + unreachable!() } } } From e2d838ec3abdd7701198738799e40bb54abe5814 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 24 Jul 2024 15:11:21 -0600 Subject: [PATCH 11/68] perf: Optimize IfExpr by delegating to CaseExpr (#681) * Unify IF and CASE expressions * revert test changes * fix --- Cargo.toml | 17 +++++ benches/cast_from_string.rs | 91 +++++++++++++++++++++++ benches/cast_numeric.rs | 79 ++++++++++++++++++++ benches/conditional.rs | 139 ++++++++++++++++++++++++++++++++++++ src/if_expr.rs | 44 ++++-------- 5 files changed, 340 insertions(+), 30 deletions(-) create mode 100644 benches/cast_from_string.rs create mode 100644 benches/cast_numeric.rs create mode 100644 benches/conditional.rs diff --git a/Cargo.toml b/Cargo.toml index 192ed102b7f6..aa4fcfc5f022 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -34,6 +34,7 @@ chrono = { workspace = true } datafusion = { workspace = true } datafusion-common = { workspace = true } datafusion-expr = { workspace = true } +datafusion-physical-expr-common = { workspace = true } datafusion-physical-expr = { workspace = true } datafusion-physical-plan = { workspace = true } chrono-tz = { workspace = true } @@ -41,6 +42,22 @@ num = { workspace = true } regex = { workspace = true } thiserror = { workspace = true } +[dev-dependencies] +criterion = "0.5.1" +rand = "0.8.5" + [lib] name = "datafusion_comet_spark_expr" path = "src/lib.rs" + +[[bench]] +name = "cast_from_string" +harness = false + +[[bench]] +name = "cast_numeric" +harness = false + +[[bench]] +name = "conditional" +harness = false \ No newline at end of file diff --git a/benches/cast_from_string.rs b/benches/cast_from_string.rs new file mode 100644 index 000000000000..51410a68ad90 --- /dev/null +++ b/benches/cast_from_string.rs @@ -0,0 +1,91 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use arrow_array::{builder::StringBuilder, RecordBatch}; +use arrow_schema::{DataType, Field, Schema}; +use criterion::{criterion_group, criterion_main, Criterion}; +use datafusion_comet_spark_expr::{Cast, EvalMode}; +use datafusion_physical_expr::{expressions::Column, PhysicalExpr}; +use std::sync::Arc; + +fn criterion_benchmark(c: &mut Criterion) { + let batch = create_utf8_batch(); + let expr = Arc::new(Column::new("a", 0)); + let timezone = "".to_string(); + let cast_string_to_i8 = Cast::new( + expr.clone(), + DataType::Int8, + EvalMode::Legacy, + timezone.clone(), + ); + let cast_string_to_i16 = Cast::new( + expr.clone(), + DataType::Int16, + EvalMode::Legacy, + timezone.clone(), + ); + let cast_string_to_i32 = Cast::new( + expr.clone(), + DataType::Int32, + EvalMode::Legacy, + timezone.clone(), + ); + let cast_string_to_i64 = Cast::new(expr, DataType::Int64, EvalMode::Legacy, timezone); + + let mut group = c.benchmark_group("cast_string_to_int"); + group.bench_function("cast_string_to_i8", |b| { + b.iter(|| cast_string_to_i8.evaluate(&batch).unwrap()); + }); + group.bench_function("cast_string_to_i16", |b| { + b.iter(|| cast_string_to_i16.evaluate(&batch).unwrap()); + }); + group.bench_function("cast_string_to_i32", |b| { + b.iter(|| cast_string_to_i32.evaluate(&batch).unwrap()); + }); + group.bench_function("cast_string_to_i64", |b| { + b.iter(|| cast_string_to_i64.evaluate(&batch).unwrap()); + }); +} + +// Create UTF8 batch with strings representing ints, floats, nulls +fn create_utf8_batch() -> RecordBatch { + let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Utf8, true)])); + let mut b = StringBuilder::new(); + for i in 0..1000 { + if i % 10 == 0 { + b.append_null(); + } else if i % 2 == 0 { + b.append_value(format!("{}", rand::random::())); + } else { + b.append_value(format!("{}", rand::random::())); + } + } + let array = b.finish(); + + RecordBatch::try_new(schema.clone(), vec![Arc::new(array)]).unwrap() +} + +fn config() -> Criterion { + Criterion::default() +} + +criterion_group! { + name = benches; + config = config(); + targets = criterion_benchmark +} +criterion_main!(benches); diff --git a/benches/cast_numeric.rs b/benches/cast_numeric.rs new file mode 100644 index 000000000000..dc0ceea79ad1 --- /dev/null +++ b/benches/cast_numeric.rs @@ -0,0 +1,79 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use arrow_array::{builder::Int32Builder, RecordBatch}; +use arrow_schema::{DataType, Field, Schema}; +use criterion::{criterion_group, criterion_main, Criterion}; +use datafusion_comet_spark_expr::{Cast, EvalMode}; +use datafusion_physical_expr::{expressions::Column, PhysicalExpr}; +use std::sync::Arc; + +fn criterion_benchmark(c: &mut Criterion) { + let batch = create_int32_batch(); + let expr = Arc::new(Column::new("a", 0)); + let timezone = "".to_string(); + let cast_i32_to_i8 = Cast::new( + expr.clone(), + DataType::Int8, + EvalMode::Legacy, + timezone.clone(), + ); + let cast_i32_to_i16 = Cast::new( + expr.clone(), + DataType::Int16, + EvalMode::Legacy, + timezone.clone(), + ); + let cast_i32_to_i64 = Cast::new(expr, DataType::Int64, EvalMode::Legacy, timezone); + + let mut group = c.benchmark_group("cast_int_to_int"); + group.bench_function("cast_i32_to_i8", |b| { + b.iter(|| cast_i32_to_i8.evaluate(&batch).unwrap()); + }); + group.bench_function("cast_i32_to_i16", |b| { + b.iter(|| cast_i32_to_i16.evaluate(&batch).unwrap()); + }); + group.bench_function("cast_i32_to_i64", |b| { + b.iter(|| cast_i32_to_i64.evaluate(&batch).unwrap()); + }); +} + +fn create_int32_batch() -> RecordBatch { + let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, true)])); + let mut b = Int32Builder::new(); + for i in 0..1000 { + if i % 10 == 0 { + b.append_null(); + } else { + b.append_value(rand::random::()); + } + } + let array = b.finish(); + + RecordBatch::try_new(schema.clone(), vec![Arc::new(array)]).unwrap() +} + +fn config() -> Criterion { + Criterion::default() +} + +criterion_group! { + name = benches; + config = config(); + targets = criterion_benchmark +} +criterion_main!(benches); diff --git a/benches/conditional.rs b/benches/conditional.rs new file mode 100644 index 000000000000..d86ef76f82ee --- /dev/null +++ b/benches/conditional.rs @@ -0,0 +1,139 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use arrow::datatypes::{Field, Schema}; +use arrow::record_batch::RecordBatch; +use arrow_array::builder::{Int32Builder, StringBuilder}; +use arrow_schema::DataType; +use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use datafusion_comet_spark_expr::IfExpr; +use datafusion_common::ScalarValue; +use datafusion_expr::Operator; +use datafusion_physical_expr::expressions::{BinaryExpr, CaseExpr}; +use datafusion_physical_expr_common::expressions::column::Column; +use datafusion_physical_expr_common::expressions::Literal; +use datafusion_physical_expr_common::physical_expr::PhysicalExpr; +use std::sync::Arc; + +fn make_col(name: &str, index: usize) -> Arc { + Arc::new(Column::new(name, index)) +} + +fn make_lit_i32(n: i32) -> Arc { + Arc::new(Literal::new(ScalarValue::Int32(Some(n)))) +} + +fn make_null_lit() -> Arc { + Arc::new(Literal::new(ScalarValue::Utf8(None))) +} + +fn criterion_benchmark(c: &mut Criterion) { + // create input data + let mut c1 = Int32Builder::new(); + let mut c2 = StringBuilder::new(); + let mut c3 = StringBuilder::new(); + for i in 0..1000 { + c1.append_value(i); + if i % 7 == 0 { + c2.append_null(); + } else { + c2.append_value(&format!("string {i}")); + } + if i % 9 == 0 { + c3.append_null(); + } else { + c3.append_value(&format!("other string {i}")); + } + } + let c1 = Arc::new(c1.finish()); + let c2 = Arc::new(c2.finish()); + let c3 = Arc::new(c3.finish()); + let schema = Schema::new(vec![ + Field::new("c1", DataType::Int32, true), + Field::new("c2", DataType::Utf8, true), + Field::new("c3", DataType::Utf8, true), + ]); + let batch = RecordBatch::try_new(Arc::new(schema), vec![c1, c2, c3]).unwrap(); + + // use same predicate for all benchmarks + let predicate = Arc::new(BinaryExpr::new( + make_col("c1", 0), + Operator::LtEq, + make_lit_i32(500), + )); + + // CASE WHEN c1 <= 500 THEN 1 ELSE 0 END + c.bench_function("case_when: scalar or scalar", |b| { + let expr = Arc::new( + CaseExpr::try_new( + None, + vec![(predicate.clone(), make_lit_i32(1))], + Some(make_lit_i32(0)), + ) + .unwrap(), + ); + b.iter(|| black_box(expr.evaluate(black_box(&batch)).unwrap())) + }); + c.bench_function("if: scalar or scalar", |b| { + let expr = Arc::new(IfExpr::new( + predicate.clone(), + make_lit_i32(1), + make_lit_i32(0), + )); + b.iter(|| black_box(expr.evaluate(black_box(&batch)).unwrap())) + }); + + // CASE WHEN c1 <= 500 THEN c2 [ELSE NULL] END + c.bench_function("case_when: column or null", |b| { + let expr = Arc::new( + CaseExpr::try_new(None, vec![(predicate.clone(), make_col("c2", 1))], None).unwrap(), + ); + b.iter(|| black_box(expr.evaluate(black_box(&batch)).unwrap())) + }); + c.bench_function("if: column or null", |b| { + let expr = Arc::new(IfExpr::new( + predicate.clone(), + make_col("c2", 1), + make_null_lit(), + )); + b.iter(|| black_box(expr.evaluate(black_box(&batch)).unwrap())) + }); + + // CASE WHEN c1 <= 500 THEN c2 ELSE c3 END + c.bench_function("case_when: expr or expr", |b| { + let expr = Arc::new( + CaseExpr::try_new( + None, + vec![(predicate.clone(), make_col("c2", 1))], + Some(make_col("c3", 2)), + ) + .unwrap(), + ); + b.iter(|| black_box(expr.evaluate(black_box(&batch)).unwrap())) + }); + c.bench_function("if: expr or expr", |b| { + let expr = Arc::new(IfExpr::new( + predicate.clone(), + make_col("c2", 1), + make_col("c3", 2), + )); + b.iter(|| black_box(expr.evaluate(black_box(&batch)).unwrap())) + }); +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); diff --git a/src/if_expr.rs b/src/if_expr.rs 
index fa52c5d5b9b9..a5344140bb8a 100644 --- a/src/if_expr.rs +++ b/src/if_expr.rs @@ -22,22 +22,24 @@ use std::{ }; use arrow::{ - array::*, - compute::{and, is_null, kernels::zip::zip, not, or_kleene}, datatypes::{DataType, Schema}, record_batch::RecordBatch, }; use datafusion::logical_expr::ColumnarValue; -use datafusion_common::{cast::as_boolean_array, Result}; -use datafusion_physical_expr::PhysicalExpr; +use datafusion_common::Result; +use datafusion_physical_expr::{expressions::CaseExpr, PhysicalExpr}; use crate::utils::down_cast_any_ref; +/// IfExpr is a wrapper around CaseExpr, because `IF(a, b, c)` is semantically equivalent to +/// `CASE WHEN a THEN b ELSE c END`. #[derive(Debug, Hash)] pub struct IfExpr { if_expr: Arc, true_expr: Arc, false_expr: Arc, + // we delegate to case_expr for evaluation + case_expr: Arc, } impl std::fmt::Display for IfExpr { @@ -58,9 +60,12 @@ impl IfExpr { false_expr: Arc, ) -> Self { Self { - if_expr, - true_expr, - false_expr, + if_expr: if_expr.clone(), + true_expr: true_expr.clone(), + false_expr: false_expr.clone(), + case_expr: Arc::new( + CaseExpr::try_new(None, vec![(if_expr, true_expr)], Some(false_expr)).unwrap(), + ), } } } @@ -85,29 +90,7 @@ impl PhysicalExpr for IfExpr { } fn evaluate(&self, batch: &RecordBatch) -> Result { - let mut remainder = BooleanArray::from(vec![true; batch.num_rows()]); - - // evaluate if condition on batch - let if_value = self.if_expr.evaluate_selection(batch, &remainder)?; - let if_value = if_value.into_array(batch.num_rows())?; - let if_value = - as_boolean_array(&if_value).expect("if expression did not return a BooleanArray"); - - let true_value = self.true_expr.evaluate_selection(batch, if_value)?; - let true_value = true_value.into_array(batch.num_rows())?; - - remainder = and( - &remainder, - &or_kleene(¬(if_value)?, &is_null(if_value)?)?, - )?; - - let false_value = self - .false_expr - .evaluate_selection(batch, &remainder)? 
- .into_array(batch.num_rows())?; - let current_value = zip(&remainder, &false_value, &true_value)?; - - Ok(ColumnarValue::Array(current_value)) + self.case_expr.evaluate(batch) } fn children(&self) -> Vec<&Arc> { @@ -150,6 +133,7 @@ impl PartialEq for IfExpr { #[cfg(test)] mod tests { use arrow::{array::StringArray, datatypes::*}; + use arrow_array::Int32Array; use datafusion::logical_expr::Operator; use datafusion_common::cast::as_int32_array; use datafusion_physical_expr::expressions::{binary, col, lit}; From 5dcf7138be3f3e5e86568866566882cc49c6f811 Mon Sep 17 00:00:00 2001 From: Arttu Date: Sat, 27 Jul 2024 14:38:05 +0200 Subject: [PATCH 12/68] chore: make Cast's logic reusable for other projects (#716) --- src/cast.rs | 1099 +++++++++++++++++++++++++-------------------------- src/lib.rs | 2 +- 2 files changed, 550 insertions(+), 551 deletions(-) diff --git a/src/cast.rs b/src/cast.rs index 9a47cc87334e..ae0818970f03 100644 --- a/src/cast.rs +++ b/src/cast.rs @@ -502,158 +502,166 @@ impl Cast { eval_mode, } } +} - fn cast_array(&self, array: ArrayRef) -> DataFusionResult { - let to_type = &self.data_type; - let array = array_with_timezone(array, self.timezone.clone(), Some(to_type))?; - let from_type = array.data_type().clone(); - let array = match &from_type { - DataType::Dictionary(key_type, value_type) - if key_type.as_ref() == &DataType::Int32 - && (value_type.as_ref() == &DataType::Utf8 - || value_type.as_ref() == &DataType::LargeUtf8) => - { - let dict_array = array - .as_any() - .downcast_ref::>() - .expect("Expected a dictionary array"); - - let casted_dictionary = DictionaryArray::::new( - dict_array.keys().clone(), - self.cast_array(dict_array.values().clone())?, - ); - - let casted_result = match to_type { - DataType::Dictionary(_, _) => Arc::new(casted_dictionary.clone()), - _ => take(casted_dictionary.values().as_ref(), dict_array.keys(), None)?, - }; - return Ok(spark_cast(casted_result, &from_type, to_type)); - } - _ => array, - }; - let from_type = array.data_type(); - - let cast_result = match (from_type, to_type) { - (DataType::Utf8, DataType::Boolean) => { - Self::spark_cast_utf8_to_boolean::(&array, self.eval_mode) - } - (DataType::LargeUtf8, DataType::Boolean) => { - Self::spark_cast_utf8_to_boolean::(&array, self.eval_mode) - } - (DataType::Utf8, DataType::Timestamp(_, _)) => { - Self::cast_string_to_timestamp(&array, to_type, self.eval_mode) - } - (DataType::Utf8, DataType::Date32) => { - Self::cast_string_to_date(&array, to_type, self.eval_mode) - } - (DataType::Int64, DataType::Int32) - | (DataType::Int64, DataType::Int16) - | (DataType::Int64, DataType::Int8) - | (DataType::Int32, DataType::Int16) - | (DataType::Int32, DataType::Int8) - | (DataType::Int16, DataType::Int8) - if self.eval_mode != EvalMode::Try => - { - Self::spark_cast_int_to_int(&array, self.eval_mode, from_type, to_type) - } - ( - DataType::Utf8, - DataType::Int8 | DataType::Int16 | DataType::Int32 | DataType::Int64, - ) => Self::cast_string_to_int::(to_type, &array, self.eval_mode), - ( - DataType::LargeUtf8, - DataType::Int8 | DataType::Int16 | DataType::Int32 | DataType::Int64, - ) => Self::cast_string_to_int::(to_type, &array, self.eval_mode), - (DataType::Float64, DataType::Utf8) => { - Self::spark_cast_float64_to_utf8::(&array, self.eval_mode) - } - (DataType::Float64, DataType::LargeUtf8) => { - Self::spark_cast_float64_to_utf8::(&array, self.eval_mode) - } - (DataType::Float32, DataType::Utf8) => { - Self::spark_cast_float32_to_utf8::(&array, self.eval_mode) - } - (DataType::Float32, 
DataType::LargeUtf8) => { - Self::spark_cast_float32_to_utf8::(&array, self.eval_mode) - } - (DataType::Float32, DataType::Decimal128(precision, scale)) => { - Self::cast_float32_to_decimal128(&array, *precision, *scale, self.eval_mode) - } - (DataType::Float64, DataType::Decimal128(precision, scale)) => { - Self::cast_float64_to_decimal128(&array, *precision, *scale, self.eval_mode) - } - (DataType::Float32, DataType::Int8) - | (DataType::Float32, DataType::Int16) - | (DataType::Float32, DataType::Int32) - | (DataType::Float32, DataType::Int64) - | (DataType::Float64, DataType::Int8) - | (DataType::Float64, DataType::Int16) - | (DataType::Float64, DataType::Int32) - | (DataType::Float64, DataType::Int64) - | (DataType::Decimal128(_, _), DataType::Int8) - | (DataType::Decimal128(_, _), DataType::Int16) - | (DataType::Decimal128(_, _), DataType::Int32) - | (DataType::Decimal128(_, _), DataType::Int64) - if self.eval_mode != EvalMode::Try => - { - Self::spark_cast_nonintegral_numeric_to_integral( - &array, - self.eval_mode, - from_type, - to_type, - ) - } - _ if Self::is_datafusion_spark_compatible(from_type, to_type) => { - // use DataFusion cast only when we know that it is compatible with Spark - Ok(cast_with_options(&array, to_type, &CAST_OPTIONS)?) - } - _ => { - // we should never reach this code because the Scala code should be checking - // for supported cast operations and falling back to Spark for anything that - // is not yet supported - Err(SparkError::Internal(format!( - "Native cast invoked for unsupported cast from {from_type:?} to {to_type:?}" - ))) - } - }; - Ok(spark_cast(cast_result?, from_type, to_type)) +/// Spark-compatible cast implementation. Defers to DataFusion's cast where that is known +/// to be compatible, and returns an error when a not supported and not DF-compatible cast +/// is requested. +pub fn spark_cast( + arg: ColumnarValue, + data_type: &DataType, + eval_mode: EvalMode, + timezone: String, +) -> DataFusionResult { + match arg { + ColumnarValue::Array(array) => Ok(ColumnarValue::Array(cast_array( + array, + data_type, + eval_mode, + timezone.to_owned(), + )?)), + ColumnarValue::Scalar(scalar) => { + // Note that normally CAST(scalar) should be fold in Spark JVM side. However, for + // some cases e.g., scalar subquery, Spark will not fold it, so we need to handle it + // here. 
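// In other words, a scalar is routed through the same code path as arrays: it is
// materialized as a single-element array, cast via cast_array, and the lone result row is
// converted back into a ScalarValue.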
+ let array = scalar.to_array()?; + let scalar = ScalarValue::try_from_array( + &cast_array(array, data_type, eval_mode, timezone.to_owned())?, + 0, + )?; + Ok(ColumnarValue::Scalar(scalar)) + } } +} - /// Determines if DataFusion supports the given cast in a way that is - /// compatible with Spark - fn is_datafusion_spark_compatible(from_type: &DataType, to_type: &DataType) -> bool { - if from_type == to_type { - return true; +fn cast_array( + array: ArrayRef, + to_type: &DataType, + eval_mode: EvalMode, + timezone: String, +) -> DataFusionResult { + let array = array_with_timezone(array, timezone.clone(), Some(to_type))?; + let from_type = array.data_type().clone(); + let array = match &from_type { + DataType::Dictionary(key_type, value_type) + if key_type.as_ref() == &DataType::Int32 + && (value_type.as_ref() == &DataType::Utf8 + || value_type.as_ref() == &DataType::LargeUtf8) => + { + let dict_array = array + .as_any() + .downcast_ref::>() + .expect("Expected a dictionary array"); + + let casted_dictionary = DictionaryArray::::new( + dict_array.keys().clone(), + cast_array(dict_array.values().clone(), to_type, eval_mode, timezone)?, + ); + + let casted_result = match to_type { + DataType::Dictionary(_, _) => Arc::new(casted_dictionary.clone()), + _ => take(casted_dictionary.values().as_ref(), dict_array.keys(), None)?, + }; + return Ok(spark_cast_postprocess(casted_result, &from_type, to_type)); } - match from_type { - DataType::Boolean => matches!( - to_type, - DataType::Int8 - | DataType::Int16 - | DataType::Int32 - | DataType::Int64 - | DataType::Float32 - | DataType::Float64 - | DataType::Utf8 - ), - DataType::Int8 | DataType::Int16 | DataType::Int32 | DataType::Int64 => { - // note that the cast from Int32/Int64 -> Decimal128 here is actually - // not compatible with Spark (no overflow checks) but we have tests that - // rely on this cast working so we have to leave it here for now - matches!( - to_type, - DataType::Boolean - | DataType::Int8 - | DataType::Int16 - | DataType::Int32 - | DataType::Int64 - | DataType::Float32 - | DataType::Float64 - | DataType::Decimal128(_, _) - | DataType::Utf8 - ) - } - DataType::Float32 | DataType::Float64 => matches!( + _ => array, + }; + let from_type = array.data_type(); + + let cast_result = match (from_type, to_type) { + (DataType::Utf8, DataType::Boolean) => spark_cast_utf8_to_boolean::(&array, eval_mode), + (DataType::LargeUtf8, DataType::Boolean) => { + spark_cast_utf8_to_boolean::(&array, eval_mode) + } + (DataType::Utf8, DataType::Timestamp(_, _)) => { + cast_string_to_timestamp(&array, to_type, eval_mode) + } + (DataType::Utf8, DataType::Date32) => cast_string_to_date(&array, to_type, eval_mode), + (DataType::Int64, DataType::Int32) + | (DataType::Int64, DataType::Int16) + | (DataType::Int64, DataType::Int8) + | (DataType::Int32, DataType::Int16) + | (DataType::Int32, DataType::Int8) + | (DataType::Int16, DataType::Int8) + if eval_mode != EvalMode::Try => + { + spark_cast_int_to_int(&array, eval_mode, from_type, to_type) + } + (DataType::Utf8, DataType::Int8 | DataType::Int16 | DataType::Int32 | DataType::Int64) => { + cast_string_to_int::(to_type, &array, eval_mode) + } + ( + DataType::LargeUtf8, + DataType::Int8 | DataType::Int16 | DataType::Int32 | DataType::Int64, + ) => cast_string_to_int::(to_type, &array, eval_mode), + (DataType::Float64, DataType::Utf8) => spark_cast_float64_to_utf8::(&array, eval_mode), + (DataType::Float64, DataType::LargeUtf8) => { + spark_cast_float64_to_utf8::(&array, eval_mode) + } + (DataType::Float32, 
DataType::Utf8) => spark_cast_float32_to_utf8::(&array, eval_mode), + (DataType::Float32, DataType::LargeUtf8) => { + spark_cast_float32_to_utf8::(&array, eval_mode) + } + (DataType::Float32, DataType::Decimal128(precision, scale)) => { + cast_float32_to_decimal128(&array, *precision, *scale, eval_mode) + } + (DataType::Float64, DataType::Decimal128(precision, scale)) => { + cast_float64_to_decimal128(&array, *precision, *scale, eval_mode) + } + (DataType::Float32, DataType::Int8) + | (DataType::Float32, DataType::Int16) + | (DataType::Float32, DataType::Int32) + | (DataType::Float32, DataType::Int64) + | (DataType::Float64, DataType::Int8) + | (DataType::Float64, DataType::Int16) + | (DataType::Float64, DataType::Int32) + | (DataType::Float64, DataType::Int64) + | (DataType::Decimal128(_, _), DataType::Int8) + | (DataType::Decimal128(_, _), DataType::Int16) + | (DataType::Decimal128(_, _), DataType::Int32) + | (DataType::Decimal128(_, _), DataType::Int64) + if eval_mode != EvalMode::Try => + { + spark_cast_nonintegral_numeric_to_integral(&array, eval_mode, from_type, to_type) + } + _ if is_datafusion_spark_compatible(from_type, to_type) => { + // use DataFusion cast only when we know that it is compatible with Spark + Ok(cast_with_options(&array, to_type, &CAST_OPTIONS)?) + } + _ => { + // we should never reach this code because the Scala code should be checking + // for supported cast operations and falling back to Spark for anything that + // is not yet supported + Err(SparkError::Internal(format!( + "Native cast invoked for unsupported cast from {from_type:?} to {to_type:?}" + ))) + } + }; + Ok(spark_cast_postprocess(cast_result?, from_type, to_type)) +} + +/// Determines if DataFusion supports the given cast in a way that is +/// compatible with Spark +fn is_datafusion_spark_compatible(from_type: &DataType, to_type: &DataType) -> bool { + if from_type == to_type { + return true; + } + match from_type { + DataType::Boolean => matches!( + to_type, + DataType::Int8 + | DataType::Int16 + | DataType::Int32 + | DataType::Int64 + | DataType::Float32 + | DataType::Float64 + | DataType::Utf8 + ), + DataType::Int8 | DataType::Int16 | DataType::Int32 | DataType::Int64 => { + // note that the cast from Int32/Int64 -> Decimal128 here is actually + // not compatible with Spark (no overflow checks) but we have tests that + // rely on this cast working so we have to leave it here for now + matches!( to_type, DataType::Boolean | DataType::Int8 @@ -662,182 +670,180 @@ impl Cast { | DataType::Int64 | DataType::Float32 | DataType::Float64 - ), - DataType::Decimal128(_, _) | DataType::Decimal256(_, _) => matches!( - to_type, - DataType::Int8 - | DataType::Int16 - | DataType::Int32 - | DataType::Int64 - | DataType::Float32 - | DataType::Float64 | DataType::Decimal128(_, _) - | DataType::Decimal256(_, _) - ), - DataType::Utf8 => matches!(to_type, DataType::Binary), - DataType::Date32 => matches!(to_type, DataType::Utf8), - DataType::Timestamp(_, _) => { - matches!( - to_type, - DataType::Int64 | DataType::Date32 | DataType::Utf8 | DataType::Timestamp(_, _) - ) - } - DataType::Binary => { - // note that this is not completely Spark compatible because - // DataFusion only supports binary data containing valid UTF-8 strings - matches!(to_type, DataType::Utf8) - } - _ => false, + | DataType::Utf8 + ) + } + DataType::Float32 | DataType::Float64 => matches!( + to_type, + DataType::Boolean + | DataType::Int8 + | DataType::Int16 + | DataType::Int32 + | DataType::Int64 + | DataType::Float32 + | DataType::Float64 + 
), + DataType::Decimal128(_, _) | DataType::Decimal256(_, _) => matches!( + to_type, + DataType::Int8 + | DataType::Int16 + | DataType::Int32 + | DataType::Int64 + | DataType::Float32 + | DataType::Float64 + | DataType::Decimal128(_, _) + | DataType::Decimal256(_, _) + ), + DataType::Utf8 => matches!(to_type, DataType::Binary), + DataType::Date32 => matches!(to_type, DataType::Utf8), + DataType::Timestamp(_, _) => { + matches!( + to_type, + DataType::Int64 | DataType::Date32 | DataType::Utf8 | DataType::Timestamp(_, _) + ) } + DataType::Binary => { + // note that this is not completely Spark compatible because + // DataFusion only supports binary data containing valid UTF-8 strings + matches!(to_type, DataType::Utf8) + } + _ => false, } +} - fn cast_string_to_int( - to_type: &DataType, - array: &ArrayRef, - eval_mode: EvalMode, - ) -> SparkResult { - let string_array = array - .as_any() - .downcast_ref::>() - .expect("cast_string_to_int expected a string array"); +fn cast_string_to_int( + to_type: &DataType, + array: &ArrayRef, + eval_mode: EvalMode, +) -> SparkResult { + let string_array = array + .as_any() + .downcast_ref::>() + .expect("cast_string_to_int expected a string array"); + + let cast_array: ArrayRef = match to_type { + DataType::Int8 => cast_utf8_to_int!(string_array, eval_mode, Int8Type, cast_string_to_i8)?, + DataType::Int16 => { + cast_utf8_to_int!(string_array, eval_mode, Int16Type, cast_string_to_i16)? + } + DataType::Int32 => { + cast_utf8_to_int!(string_array, eval_mode, Int32Type, cast_string_to_i32)? + } + DataType::Int64 => { + cast_utf8_to_int!(string_array, eval_mode, Int64Type, cast_string_to_i64)? + } + dt => unreachable!( + "{}", + format!("invalid integer type {dt} in cast from string") + ), + }; + Ok(cast_array) +} - let cast_array: ArrayRef = match to_type { - DataType::Int8 => { - cast_utf8_to_int!(string_array, eval_mode, Int8Type, cast_string_to_i8)? - } - DataType::Int16 => { - cast_utf8_to_int!(string_array, eval_mode, Int16Type, cast_string_to_i16)? - } - DataType::Int32 => { - cast_utf8_to_int!(string_array, eval_mode, Int32Type, cast_string_to_i32)? - } - DataType::Int64 => { - cast_utf8_to_int!(string_array, eval_mode, Int64Type, cast_string_to_i64)? 
- } - dt => unreachable!( - "{}", - format!("invalid integer type {dt} in cast from string") - ), - }; - Ok(cast_array) +fn cast_string_to_date( + array: &ArrayRef, + to_type: &DataType, + eval_mode: EvalMode, +) -> SparkResult { + let string_array = array + .as_any() + .downcast_ref::>() + .expect("Expected a string array"); + + if to_type != &DataType::Date32 { + unreachable!("Invalid data type {:?} in cast from string", to_type); } - fn cast_string_to_date( - array: &ArrayRef, - to_type: &DataType, - eval_mode: EvalMode, - ) -> SparkResult { - let string_array = array - .as_any() - .downcast_ref::>() - .expect("Expected a string array"); + let len = string_array.len(); + let mut cast_array = PrimitiveArray::::builder(len); - if to_type != &DataType::Date32 { - unreachable!("Invalid data type {:?} in cast from string", to_type); - } + for i in 0..len { + let value = if string_array.is_null(i) { + None + } else { + match date_parser(string_array.value(i), eval_mode) { + Ok(Some(cast_value)) => Some(cast_value), + Ok(None) => None, + Err(e) => return Err(e), + } + }; - let len = string_array.len(); - let mut cast_array = PrimitiveArray::::builder(len); + match value { + Some(cast_value) => cast_array.append_value(cast_value), + None => cast_array.append_null(), + } + } - for i in 0..len { - let value = if string_array.is_null(i) { - None - } else { - match date_parser(string_array.value(i), eval_mode) { - Ok(Some(cast_value)) => Some(cast_value), - Ok(None) => None, - Err(e) => return Err(e), - } - }; + Ok(Arc::new(cast_array.finish()) as ArrayRef) +} - match value { - Some(cast_value) => cast_array.append_value(cast_value), - None => cast_array.append_null(), - } +fn cast_string_to_timestamp( + array: &ArrayRef, + to_type: &DataType, + eval_mode: EvalMode, +) -> SparkResult { + let string_array = array + .as_any() + .downcast_ref::>() + .expect("Expected a string array"); + + let cast_array: ArrayRef = match to_type { + DataType::Timestamp(_, _) => { + cast_utf8_to_timestamp!( + string_array, + eval_mode, + TimestampMicrosecondType, + timestamp_parser + ) } + _ => unreachable!("Invalid data type {:?} in cast from string", to_type), + }; + Ok(cast_array) +} - Ok(Arc::new(cast_array.finish()) as ArrayRef) - } +fn cast_float64_to_decimal128( + array: &dyn Array, + precision: u8, + scale: i8, + eval_mode: EvalMode, +) -> SparkResult { + cast_floating_point_to_decimal128::(array, precision, scale, eval_mode) +} - fn cast_string_to_timestamp( - array: &ArrayRef, - to_type: &DataType, - eval_mode: EvalMode, - ) -> SparkResult { - let string_array = array - .as_any() - .downcast_ref::>() - .expect("Expected a string array"); +fn cast_float32_to_decimal128( + array: &dyn Array, + precision: u8, + scale: i8, + eval_mode: EvalMode, +) -> SparkResult { + cast_floating_point_to_decimal128::(array, precision, scale, eval_mode) +} - let cast_array: ArrayRef = match to_type { - DataType::Timestamp(_, _) => { - cast_utf8_to_timestamp!( - string_array, - eval_mode, - TimestampMicrosecondType, - timestamp_parser - ) - } - _ => unreachable!("Invalid data type {:?} in cast from string", to_type), - }; - Ok(cast_array) - } +fn cast_floating_point_to_decimal128( + array: &dyn Array, + precision: u8, + scale: i8, + eval_mode: EvalMode, +) -> SparkResult +where + ::Native: AsPrimitive, +{ + let input = array.as_any().downcast_ref::>().unwrap(); + let mut cast_array = PrimitiveArray::::builder(input.len()); - fn cast_float64_to_decimal128( - array: &dyn Array, - precision: u8, - scale: i8, - eval_mode: EvalMode, - 
) -> SparkResult { - Self::cast_floating_point_to_decimal128::(array, precision, scale, eval_mode) - } + let mul = 10_f64.powi(scale as i32); - fn cast_float32_to_decimal128( - array: &dyn Array, - precision: u8, - scale: i8, - eval_mode: EvalMode, - ) -> SparkResult { - Self::cast_floating_point_to_decimal128::(array, precision, scale, eval_mode) - } + for i in 0..input.len() { + if input.is_null(i) { + cast_array.append_null(); + } else { + let input_value = input.value(i).as_(); + let value = (input_value * mul).round().to_i128(); - fn cast_floating_point_to_decimal128( - array: &dyn Array, - precision: u8, - scale: i8, - eval_mode: EvalMode, - ) -> SparkResult - where - ::Native: AsPrimitive, - { - let input = array.as_any().downcast_ref::>().unwrap(); - let mut cast_array = PrimitiveArray::::builder(input.len()); - - let mul = 10_f64.powi(scale as i32); - - for i in 0..input.len() { - if input.is_null(i) { - cast_array.append_null(); - } else { - let input_value = input.value(i).as_(); - let value = (input_value * mul).round().to_i128(); - - match value { - Some(v) => { - if Decimal128Type::validate_decimal_precision(v, precision).is_err() { - if eval_mode == EvalMode::Ansi { - return Err(SparkError::NumericValueOutOfRange { - value: input_value.to_string(), - precision, - scale, - }); - } else { - cast_array.append_null(); - } - } - cast_array.append_value(v); - } - None => { + match value { + Some(v) => { + if Decimal128Type::validate_decimal_precision(v, precision).is_err() { if eval_mode == EvalMode::Ansi { return Err(SparkError::NumericValueOutOfRange { value: input_value.to_string(), @@ -848,240 +854,252 @@ impl Cast { cast_array.append_null(); } } + cast_array.append_value(v); + } + None => { + if eval_mode == EvalMode::Ansi { + return Err(SparkError::NumericValueOutOfRange { + value: input_value.to_string(), + precision, + scale, + }); + } else { + cast_array.append_null(); + } } } } - - let res = Arc::new( - cast_array - .with_precision_and_scale(precision, scale)? - .finish(), - ) as ArrayRef; - Ok(res) } - fn spark_cast_float64_to_utf8( - from: &dyn Array, - _eval_mode: EvalMode, - ) -> SparkResult - where - OffsetSize: OffsetSizeTrait, - { - cast_float_to_string!(from, _eval_mode, f64, Float64Array, OffsetSize) - } + let res = Arc::new( + cast_array + .with_precision_and_scale(precision, scale)? 
+ .finish(), + ) as ArrayRef; + Ok(res) +} - fn spark_cast_float32_to_utf8( - from: &dyn Array, - _eval_mode: EvalMode, - ) -> SparkResult - where - OffsetSize: OffsetSizeTrait, - { - cast_float_to_string!(from, _eval_mode, f32, Float32Array, OffsetSize) - } +fn spark_cast_float64_to_utf8( + from: &dyn Array, + _eval_mode: EvalMode, +) -> SparkResult +where + OffsetSize: OffsetSizeTrait, +{ + cast_float_to_string!(from, _eval_mode, f64, Float64Array, OffsetSize) +} - fn spark_cast_int_to_int( - array: &dyn Array, - eval_mode: EvalMode, - from_type: &DataType, - to_type: &DataType, - ) -> SparkResult { - match (from_type, to_type) { - (DataType::Int64, DataType::Int32) => cast_int_to_int_macro!( - array, eval_mode, Int64Type, Int32Type, from_type, i32, "BIGINT", "INT" - ), - (DataType::Int64, DataType::Int16) => cast_int_to_int_macro!( - array, eval_mode, Int64Type, Int16Type, from_type, i16, "BIGINT", "SMALLINT" - ), - (DataType::Int64, DataType::Int8) => cast_int_to_int_macro!( - array, eval_mode, Int64Type, Int8Type, from_type, i8, "BIGINT", "TINYINT" - ), - (DataType::Int32, DataType::Int16) => cast_int_to_int_macro!( - array, eval_mode, Int32Type, Int16Type, from_type, i16, "INT", "SMALLINT" - ), - (DataType::Int32, DataType::Int8) => cast_int_to_int_macro!( - array, eval_mode, Int32Type, Int8Type, from_type, i8, "INT", "TINYINT" - ), - (DataType::Int16, DataType::Int8) => cast_int_to_int_macro!( - array, eval_mode, Int16Type, Int8Type, from_type, i8, "SMALLINT", "TINYINT" - ), - _ => unreachable!( - "{}", - format!("invalid integer type {to_type} in cast from {from_type}") - ), - } +fn spark_cast_float32_to_utf8( + from: &dyn Array, + _eval_mode: EvalMode, +) -> SparkResult +where + OffsetSize: OffsetSizeTrait, +{ + cast_float_to_string!(from, _eval_mode, f32, Float32Array, OffsetSize) +} + +fn spark_cast_int_to_int( + array: &dyn Array, + eval_mode: EvalMode, + from_type: &DataType, + to_type: &DataType, +) -> SparkResult { + match (from_type, to_type) { + (DataType::Int64, DataType::Int32) => cast_int_to_int_macro!( + array, eval_mode, Int64Type, Int32Type, from_type, i32, "BIGINT", "INT" + ), + (DataType::Int64, DataType::Int16) => cast_int_to_int_macro!( + array, eval_mode, Int64Type, Int16Type, from_type, i16, "BIGINT", "SMALLINT" + ), + (DataType::Int64, DataType::Int8) => cast_int_to_int_macro!( + array, eval_mode, Int64Type, Int8Type, from_type, i8, "BIGINT", "TINYINT" + ), + (DataType::Int32, DataType::Int16) => cast_int_to_int_macro!( + array, eval_mode, Int32Type, Int16Type, from_type, i16, "INT", "SMALLINT" + ), + (DataType::Int32, DataType::Int8) => cast_int_to_int_macro!( + array, eval_mode, Int32Type, Int8Type, from_type, i8, "INT", "TINYINT" + ), + (DataType::Int16, DataType::Int8) => cast_int_to_int_macro!( + array, eval_mode, Int16Type, Int8Type, from_type, i8, "SMALLINT", "TINYINT" + ), + _ => unreachable!( + "{}", + format!("invalid integer type {to_type} in cast from {from_type}") + ), } +} - fn spark_cast_utf8_to_boolean( - from: &dyn Array, - eval_mode: EvalMode, - ) -> SparkResult - where - OffsetSize: OffsetSizeTrait, - { - let array = from - .as_any() - .downcast_ref::>() - .unwrap(); +fn spark_cast_utf8_to_boolean( + from: &dyn Array, + eval_mode: EvalMode, +) -> SparkResult +where + OffsetSize: OffsetSizeTrait, +{ + let array = from + .as_any() + .downcast_ref::>() + .unwrap(); - let output_array = array - .iter() - .map(|value| match value { - Some(value) => match value.to_ascii_lowercase().trim() { - "t" | "true" | "y" | "yes" | "1" => Ok(Some(true)), - 
"f" | "false" | "n" | "no" | "0" => Ok(Some(false)), - _ if eval_mode == EvalMode::Ansi => Err(SparkError::CastInvalidValue { - value: value.to_string(), - from_type: "STRING".to_string(), - to_type: "BOOLEAN".to_string(), - }), - _ => Ok(None), - }, + let output_array = array + .iter() + .map(|value| match value { + Some(value) => match value.to_ascii_lowercase().trim() { + "t" | "true" | "y" | "yes" | "1" => Ok(Some(true)), + "f" | "false" | "n" | "no" | "0" => Ok(Some(false)), + _ if eval_mode == EvalMode::Ansi => Err(SparkError::CastInvalidValue { + value: value.to_string(), + from_type: "STRING".to_string(), + to_type: "BOOLEAN".to_string(), + }), _ => Ok(None), - }) - .collect::>()?; + }, + _ => Ok(None), + }) + .collect::>()?; - Ok(Arc::new(output_array)) - } + Ok(Arc::new(output_array)) +} - fn spark_cast_nonintegral_numeric_to_integral( - array: &dyn Array, - eval_mode: EvalMode, - from_type: &DataType, - to_type: &DataType, - ) -> SparkResult { - match (from_type, to_type) { - (DataType::Float32, DataType::Int8) => cast_float_to_int16_down!( - array, - eval_mode, - Float32Array, - Int8Array, - f32, - i8, - "FLOAT", - "TINYINT", - "{:e}" - ), - (DataType::Float32, DataType::Int16) => cast_float_to_int16_down!( - array, - eval_mode, - Float32Array, - Int16Array, - f32, - i16, - "FLOAT", - "SMALLINT", - "{:e}" - ), - (DataType::Float32, DataType::Int32) => cast_float_to_int32_up!( - array, - eval_mode, - Float32Array, - Int32Array, - f32, - i32, - "FLOAT", - "INT", - i32::MAX, - "{:e}" - ), - (DataType::Float32, DataType::Int64) => cast_float_to_int32_up!( - array, - eval_mode, - Float32Array, - Int64Array, - f32, - i64, - "FLOAT", - "BIGINT", - i64::MAX, - "{:e}" - ), - (DataType::Float64, DataType::Int8) => cast_float_to_int16_down!( - array, - eval_mode, - Float64Array, - Int8Array, - f64, - i8, - "DOUBLE", - "TINYINT", - "{:e}D" - ), - (DataType::Float64, DataType::Int16) => cast_float_to_int16_down!( - array, - eval_mode, - Float64Array, - Int16Array, - f64, - i16, - "DOUBLE", - "SMALLINT", - "{:e}D" - ), - (DataType::Float64, DataType::Int32) => cast_float_to_int32_up!( +fn spark_cast_nonintegral_numeric_to_integral( + array: &dyn Array, + eval_mode: EvalMode, + from_type: &DataType, + to_type: &DataType, +) -> SparkResult { + match (from_type, to_type) { + (DataType::Float32, DataType::Int8) => cast_float_to_int16_down!( + array, + eval_mode, + Float32Array, + Int8Array, + f32, + i8, + "FLOAT", + "TINYINT", + "{:e}" + ), + (DataType::Float32, DataType::Int16) => cast_float_to_int16_down!( + array, + eval_mode, + Float32Array, + Int16Array, + f32, + i16, + "FLOAT", + "SMALLINT", + "{:e}" + ), + (DataType::Float32, DataType::Int32) => cast_float_to_int32_up!( + array, + eval_mode, + Float32Array, + Int32Array, + f32, + i32, + "FLOAT", + "INT", + i32::MAX, + "{:e}" + ), + (DataType::Float32, DataType::Int64) => cast_float_to_int32_up!( + array, + eval_mode, + Float32Array, + Int64Array, + f32, + i64, + "FLOAT", + "BIGINT", + i64::MAX, + "{:e}" + ), + (DataType::Float64, DataType::Int8) => cast_float_to_int16_down!( + array, + eval_mode, + Float64Array, + Int8Array, + f64, + i8, + "DOUBLE", + "TINYINT", + "{:e}D" + ), + (DataType::Float64, DataType::Int16) => cast_float_to_int16_down!( + array, + eval_mode, + Float64Array, + Int16Array, + f64, + i16, + "DOUBLE", + "SMALLINT", + "{:e}D" + ), + (DataType::Float64, DataType::Int32) => cast_float_to_int32_up!( + array, + eval_mode, + Float64Array, + Int32Array, + f64, + i32, + "DOUBLE", + "INT", + i32::MAX, + "{:e}D" + ), + 
(DataType::Float64, DataType::Int64) => cast_float_to_int32_up!( + array, + eval_mode, + Float64Array, + Int64Array, + f64, + i64, + "DOUBLE", + "BIGINT", + i64::MAX, + "{:e}D" + ), + (DataType::Decimal128(precision, scale), DataType::Int8) => { + cast_decimal_to_int16_down!( + array, eval_mode, Int8Array, i8, "TINYINT", precision, *scale + ) + } + (DataType::Decimal128(precision, scale), DataType::Int16) => { + cast_decimal_to_int16_down!( + array, eval_mode, Int16Array, i16, "SMALLINT", precision, *scale + ) + } + (DataType::Decimal128(precision, scale), DataType::Int32) => { + cast_decimal_to_int32_up!( array, eval_mode, - Float64Array, Int32Array, - f64, i32, - "DOUBLE", "INT", i32::MAX, - "{:e}D" - ), - (DataType::Float64, DataType::Int64) => cast_float_to_int32_up!( + *precision, + *scale + ) + } + (DataType::Decimal128(precision, scale), DataType::Int64) => { + cast_decimal_to_int32_up!( array, eval_mode, - Float64Array, Int64Array, - f64, i64, - "DOUBLE", "BIGINT", i64::MAX, - "{:e}D" - ), - (DataType::Decimal128(precision, scale), DataType::Int8) => { - cast_decimal_to_int16_down!( - array, eval_mode, Int8Array, i8, "TINYINT", precision, *scale - ) - } - (DataType::Decimal128(precision, scale), DataType::Int16) => { - cast_decimal_to_int16_down!( - array, eval_mode, Int16Array, i16, "SMALLINT", precision, *scale - ) - } - (DataType::Decimal128(precision, scale), DataType::Int32) => { - cast_decimal_to_int32_up!( - array, - eval_mode, - Int32Array, - i32, - "INT", - i32::MAX, - *precision, - *scale - ) - } - (DataType::Decimal128(precision, scale), DataType::Int64) => { - cast_decimal_to_int32_up!( - array, - eval_mode, - Int64Array, - i64, - "BIGINT", - i64::MAX, - *precision, - *scale - ) - } - _ => unreachable!( - "{}", - format!("invalid cast from non-integral numeric type: {from_type} to integral numeric type: {to_type}") - ), + *precision, + *scale + ) } + _ => unreachable!( + "{}", + format!("invalid cast from non-integral numeric type: {from_type} to integral numeric type: {to_type}") + ), } } @@ -1294,17 +1312,7 @@ impl PhysicalExpr for Cast { fn evaluate(&self, batch: &RecordBatch) -> DataFusionResult { let arg = self.child.evaluate(batch)?; - match arg { - ColumnarValue::Array(array) => Ok(ColumnarValue::Array(self.cast_array(array)?)), - ColumnarValue::Scalar(scalar) => { - // Note that normally CAST(scalar) should be fold in Spark JVM side. However, for - // some cases e.g., scalar subquery, Spark will not fold it, so we need to handle it - // here. - let array = scalar.to_array()?; - let scalar = ScalarValue::try_from_array(&self.cast_array(array)?, 0)?; - Ok(ColumnarValue::Scalar(scalar)) - } - } + spark_cast(arg, &self.data_type, self.eval_mode, self.timezone.clone()) } fn children(&self) -> Vec<&Arc> { @@ -1660,7 +1668,7 @@ fn date_parser(date_str: &str, eval_mode: EvalMode) -> SparkResult> /// Dictionary arrays are already unpacked by the DataFusion cast() since Spark cannot specify /// Dictionary as to_type. The from_type is taken before the DataFusion cast() runs in /// expressions/cast.rs, so it can be still Dictionary. 
-fn spark_cast(array: ArrayRef, from_type: &DataType, to_type: &DataType) -> ArrayRef { +fn spark_cast_postprocess(array: ArrayRef, from_type: &DataType, to_type: &DataType) -> ArrayRef { match (from_type, to_type) { (DataType::Timestamp(_, _), DataType::Int64) => { // See Spark's `Cast` expression @@ -1739,8 +1747,6 @@ mod tests { use arrow_array::StringArray; use arrow_schema::TimeUnit; - use datafusion_physical_expr::expressions::Column; - use super::*; #[test] @@ -1819,18 +1825,14 @@ mod tests { ])); let dict_array = Arc::new(DictionaryArray::new(keys, values)); - // prepare cast expression let timezone = "UTC".to_string(); - let expr = Arc::new(Column::new("a", 0)); // this is not used by the test - let cast = Cast::new( - expr, - DataType::Timestamp(TimeUnit::Microsecond, Some(timezone.clone().into())), + // test casting string dictionary array to timestamp array + let result = cast_array( + dict_array, + &DataType::Timestamp(TimeUnit::Microsecond, Some(timezone.clone().into())), EvalMode::Legacy, timezone.clone(), - ); - - // test casting string dictionary array to timestamp array - let result = cast.cast_array(dict_array)?; + )?; assert_eq!( *result.data_type(), DataType::Timestamp(TimeUnit::Microsecond, Some(timezone.into())) @@ -1912,8 +1914,7 @@ mod tests { Some("2020-01-01T"), ])); - let result = - Cast::cast_string_to_date(&array, &DataType::Date32, EvalMode::Legacy).unwrap(); + let result = cast_string_to_date(&array, &DataType::Date32, EvalMode::Legacy).unwrap(); let date32_array = result .as_any() @@ -1939,7 +1940,7 @@ mod tests { for eval_mode in &[EvalMode::Legacy, EvalMode::Try, EvalMode::Ansi] { let result = - Cast::cast_string_to_date(&array_with_invalid_date, &DataType::Date32, *eval_mode) + cast_string_to_date(&array_with_invalid_date, &DataType::Date32, *eval_mode) .unwrap(); let date32_array = result @@ -1971,7 +1972,7 @@ mod tests { for eval_mode in &[EvalMode::Legacy, EvalMode::Try] { let result = - Cast::cast_string_to_date(&array_with_invalid_date, &DataType::Date32, *eval_mode) + cast_string_to_date(&array_with_invalid_date, &DataType::Date32, *eval_mode) .unwrap(); let date32_array = result @@ -1995,7 +1996,7 @@ mod tests { } let result = - Cast::cast_string_to_date(&array_with_invalid_date, &DataType::Date32, EvalMode::Ansi); + cast_string_to_date(&array_with_invalid_date, &DataType::Date32, EvalMode::Ansi); match result { Err(e) => assert!( e.to_string().contains( @@ -2035,26 +2036,24 @@ mod tests { fn test_cast_unsupported_timestamp_to_date() { // Since datafusion uses chrono::Datetime internally not all dates representable by TimestampMicrosecondType are supported let timestamps: PrimitiveArray = vec![i64::MAX].into(); - let cast = Cast::new( - Arc::new(Column::new("a", 0)), - DataType::Date32, + let result = cast_array( + Arc::new(timestamps.with_timezone("Europe/Copenhagen")), + &DataType::Date32, EvalMode::Legacy, "UTC".to_owned(), ); - let result = cast.cast_array(Arc::new(timestamps.with_timezone("Europe/Copenhagen"))); assert!(result.is_err()) } #[test] fn test_cast_invalid_timezone() { let timestamps: PrimitiveArray = vec![i64::MAX].into(); - let cast = Cast::new( - Arc::new(Column::new("a", 0)), - DataType::Date32, + let result = cast_array( + Arc::new(timestamps.with_timezone("Europe/Copenhagen")), + &DataType::Date32, EvalMode::Legacy, "Not a valid timezone".to_owned(), ); - let result = cast.cast_array(Arc::new(timestamps.with_timezone("Europe/Copenhagen"))); assert!(result.is_err()) } } diff --git a/src/lib.rs b/src/lib.rs index 
336201f4846e..22628978d5b5 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -24,7 +24,7 @@ mod temporal; pub mod timezone; pub mod utils; -pub use cast::Cast; +pub use cast::{spark_cast, Cast}; pub use error::{SparkError, SparkResult}; pub use if_expr::IfExpr; pub use temporal::{DateTruncExpr, HourExpr, MinuteExpr, SecondExpr, TimestampTruncExpr}; From 2a4dc7b1ed6a17f21df9aae2f59de2460921714d Mon Sep 17 00:00:00 2001 From: Arttu Date: Sun, 28 Jul 2024 15:44:32 +0200 Subject: [PATCH 13/68] chore: move scalar_funcs into spark-expr (#712) --- Cargo.toml | 7 +- src/lib.rs | 3 + src/scalar_funcs.rs | 533 ++++++++++++++++++++ src/scalar_funcs/chr.rs | 125 +++++ src/scalar_funcs/hash_expressions.rs | 162 ++++++ src/scalar_funcs/hex.rs | 296 +++++++++++ src/scalar_funcs/unhex.rs | 258 ++++++++++ src/spark_hash.rs | 708 +++++++++++++++++++++++++++ src/xxhash64.rs | 190 +++++++ 9 files changed, 2280 insertions(+), 2 deletions(-) create mode 100644 src/scalar_funcs.rs create mode 100644 src/scalar_funcs/chr.rs create mode 100644 src/scalar_funcs/hash_expressions.rs create mode 100644 src/scalar_funcs/hex.rs create mode 100644 src/scalar_funcs/unhex.rs create mode 100644 src/spark_hash.rs create mode 100644 src/xxhash64.rs diff --git a/Cargo.toml b/Cargo.toml index aa4fcfc5f022..a535a2b817e6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -41,10 +41,13 @@ chrono-tz = { workspace = true } num = { workspace = true } regex = { workspace = true } thiserror = { workspace = true } +unicode-segmentation = "1.11.0" [dev-dependencies] +arrow-data = {workspace = true} criterion = "0.5.1" -rand = "0.8.5" +rand = { workspace = true} +twox-hash = "1.6.3" [lib] name = "datafusion_comet_spark_expr" @@ -60,4 +63,4 @@ harness = false [[bench]] name = "conditional" -harness = false \ No newline at end of file +harness = false diff --git a/src/lib.rs b/src/lib.rs index 22628978d5b5..14ab080b466c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -20,9 +20,12 @@ mod error; mod if_expr; mod kernels; +pub mod scalar_funcs; +pub mod spark_hash; mod temporal; pub mod timezone; pub mod utils; +mod xxhash64; pub use cast::{spark_cast, Cast}; pub use error::{SparkError, SparkResult}; diff --git a/src/scalar_funcs.rs b/src/scalar_funcs.rs new file mode 100644 index 000000000000..c50b98bafea4 --- /dev/null +++ b/src/scalar_funcs.rs @@ -0,0 +1,533 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
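+
+//! Spark-compatible scalar functions used by Comet's native engine: `ceil`, `floor`, `round`,
+//! `rpad`, `unscaled_value`, `make_decimal`, decimal division, and `isnan`, plus the `hex`,
+//! `unhex`, `chr`, and hash-expression submodules re-exported below.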
+ +use std::{cmp::min, sync::Arc}; + +use arrow::{ + array::{ + ArrayRef, AsArray, Decimal128Builder, Float32Array, Float64Array, GenericStringArray, + Int16Array, Int32Array, Int64Array, Int64Builder, Int8Array, OffsetSizeTrait, + }, + datatypes::{validate_decimal_precision, Decimal128Type, Int64Type}, +}; +use arrow_array::{Array, ArrowNativeTypeOp, BooleanArray, Decimal128Array}; +use arrow_schema::DataType; +use datafusion::{functions::math::round::round, physical_plan::ColumnarValue}; +use datafusion_common::{ + cast::as_generic_string_array, exec_err, internal_err, DataFusionError, + Result as DataFusionResult, ScalarValue, +}; +use num::{ + integer::{div_ceil, div_floor}, + BigInt, Signed, ToPrimitive, +}; +use unicode_segmentation::UnicodeSegmentation; + +mod unhex; +pub use unhex::spark_unhex; + +mod hex; +pub use hex::spark_hex; + +mod chr; +pub use chr::SparkChrFunc; + +pub mod hash_expressions; +// exposed for benchmark only +pub use hash_expressions::{spark_murmur3_hash, spark_xxhash64}; + +#[inline] +fn get_precision_scale(data_type: &DataType) -> (u8, i8) { + let DataType::Decimal128(precision, scale) = data_type else { + unreachable!() + }; + (*precision, *scale) +} + +macro_rules! downcast_compute_op { + ($ARRAY:expr, $NAME:expr, $FUNC:ident, $TYPE:ident, $RESULT:ident) => {{ + let n = $ARRAY.as_any().downcast_ref::<$TYPE>(); + match n { + Some(array) => { + let res: $RESULT = + arrow::compute::kernels::arity::unary(array, |x| x.$FUNC() as i64); + Ok(Arc::new(res)) + } + _ => Err(DataFusionError::Internal(format!( + "Invalid data type for {}", + $NAME + ))), + } + }}; +} + +/// `ceil` function that simulates Spark `ceil` expression +pub fn spark_ceil( + args: &[ColumnarValue], + data_type: &DataType, +) -> Result { + let value = &args[0]; + match value { + ColumnarValue::Array(array) => match array.data_type() { + DataType::Float32 => { + let result = downcast_compute_op!(array, "ceil", ceil, Float32Array, Int64Array); + Ok(ColumnarValue::Array(result?)) + } + DataType::Float64 => { + let result = downcast_compute_op!(array, "ceil", ceil, Float64Array, Int64Array); + Ok(ColumnarValue::Array(result?)) + } + DataType::Int64 => { + let result = array.as_any().downcast_ref::().unwrap(); + Ok(ColumnarValue::Array(Arc::new(result.clone()))) + } + DataType::Decimal128(_, scale) if *scale > 0 => { + let f = decimal_ceil_f(scale); + let (precision, scale) = get_precision_scale(data_type); + make_decimal_array(array, precision, scale, &f) + } + other => Err(DataFusionError::Internal(format!( + "Unsupported data type {:?} for function ceil", + other, + ))), + }, + ColumnarValue::Scalar(a) => match a { + ScalarValue::Float32(a) => Ok(ColumnarValue::Scalar(ScalarValue::Int64( + a.map(|x| x.ceil() as i64), + ))), + ScalarValue::Float64(a) => Ok(ColumnarValue::Scalar(ScalarValue::Int64( + a.map(|x| x.ceil() as i64), + ))), + ScalarValue::Int64(a) => Ok(ColumnarValue::Scalar(ScalarValue::Int64(a.map(|x| x)))), + ScalarValue::Decimal128(a, _, scale) if *scale > 0 => { + let f = decimal_ceil_f(scale); + let (precision, scale) = get_precision_scale(data_type); + make_decimal_scalar(a, precision, scale, &f) + } + _ => Err(DataFusionError::Internal(format!( + "Unsupported data type {:?} for function ceil", + value.data_type(), + ))), + }, + } +} + +/// `floor` function that simulates Spark `floor` expression +pub fn spark_floor( + args: &[ColumnarValue], + data_type: &DataType, +) -> Result { + let value = &args[0]; + match value { + ColumnarValue::Array(array) => match array.data_type() { + 
DataType::Float32 => { + let result = downcast_compute_op!(array, "floor", floor, Float32Array, Int64Array); + Ok(ColumnarValue::Array(result?)) + } + DataType::Float64 => { + let result = downcast_compute_op!(array, "floor", floor, Float64Array, Int64Array); + Ok(ColumnarValue::Array(result?)) + } + DataType::Int64 => { + let result = array.as_any().downcast_ref::().unwrap(); + Ok(ColumnarValue::Array(Arc::new(result.clone()))) + } + DataType::Decimal128(_, scale) if *scale > 0 => { + let f = decimal_floor_f(scale); + let (precision, scale) = get_precision_scale(data_type); + make_decimal_array(array, precision, scale, &f) + } + other => Err(DataFusionError::Internal(format!( + "Unsupported data type {:?} for function floor", + other, + ))), + }, + ColumnarValue::Scalar(a) => match a { + ScalarValue::Float32(a) => Ok(ColumnarValue::Scalar(ScalarValue::Int64( + a.map(|x| x.floor() as i64), + ))), + ScalarValue::Float64(a) => Ok(ColumnarValue::Scalar(ScalarValue::Int64( + a.map(|x| x.floor() as i64), + ))), + ScalarValue::Int64(a) => Ok(ColumnarValue::Scalar(ScalarValue::Int64(a.map(|x| x)))), + ScalarValue::Decimal128(a, _, scale) if *scale > 0 => { + let f = decimal_floor_f(scale); + let (precision, scale) = get_precision_scale(data_type); + make_decimal_scalar(a, precision, scale, &f) + } + _ => Err(DataFusionError::Internal(format!( + "Unsupported data type {:?} for function floor", + value.data_type(), + ))), + }, + } +} + +/// Spark-compatible `UnscaledValue` expression (internal to Spark optimizer) +pub fn spark_unscaled_value(args: &[ColumnarValue]) -> DataFusionResult { + match &args[0] { + ColumnarValue::Scalar(v) => match v { + ScalarValue::Decimal128(d, _, _) => Ok(ColumnarValue::Scalar(ScalarValue::Int64( + d.map(|n| n as i64), + ))), + dt => internal_err!("Expected Decimal128 but found {dt:}"), + }, + ColumnarValue::Array(a) => { + let arr = a.as_primitive::(); + let mut result = Int64Builder::new(); + for v in arr.into_iter() { + result.append_option(v.map(|v| v as i64)); + } + Ok(ColumnarValue::Array(Arc::new(result.finish()))) + } + } +} + +/// Spark-compatible `MakeDecimal` expression (internal to Spark optimizer) +pub fn spark_make_decimal( + args: &[ColumnarValue], + data_type: &DataType, +) -> DataFusionResult { + let (precision, scale) = get_precision_scale(data_type); + match &args[0] { + ColumnarValue::Scalar(v) => match v { + ScalarValue::Int64(n) => Ok(ColumnarValue::Scalar(ScalarValue::Decimal128( + long_to_decimal(n, precision), + precision, + scale, + ))), + sv => internal_err!("Expected Int64 but found {sv:?}"), + }, + ColumnarValue::Array(a) => { + let arr = a.as_primitive::(); + let mut result = Decimal128Builder::new(); + for v in arr.into_iter() { + result.append_option(long_to_decimal(&v, precision)) + } + let result_type = DataType::Decimal128(precision, scale); + + Ok(ColumnarValue::Array(Arc::new( + result.finish().with_data_type(result_type), + ))) + } + } +} + +/// Convert the input long to decimal with the given maximum precision. If overflows, returns null +/// instead. 
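+///
+/// For example, `long_to_decimal(&Some(123), 3)` yields `Some(123)`, while
+/// `long_to_decimal(&Some(1234), 3)` does not fit in 3 digits of precision and yields `None`.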
+#[inline] +fn long_to_decimal(v: &Option, precision: u8) -> Option { + match v { + Some(v) if validate_decimal_precision(*v as i128, precision).is_ok() => Some(*v as i128), + _ => None, + } +} + +#[inline] +fn decimal_ceil_f(scale: &i8) -> impl Fn(i128) -> i128 { + let div = 10_i128.pow_wrapping(*scale as u32); + move |x: i128| div_ceil(x, div) +} + +#[inline] +fn decimal_floor_f(scale: &i8) -> impl Fn(i128) -> i128 { + let div = 10_i128.pow_wrapping(*scale as u32); + move |x: i128| div_floor(x, div) +} + +// Spark uses BigDecimal. See RoundBase implementation in Spark. Instead, we do the same by +// 1) add the half of divisor, 2) round down by division, 3) adjust precision by multiplication +#[inline] +fn decimal_round_f(scale: &i8, point: &i64) -> Box i128> { + if *point < 0 { + if let Some(div) = 10_i128.checked_pow((-(*point) as u32) + (*scale as u32)) { + let half = div / 2; + let mul = 10_i128.pow_wrapping((-(*point)) as u32); + // i128 can hold 39 digits of a base 10 number, adding half will not cause overflow + Box::new(move |x: i128| (x + x.signum() * half) / div * mul) + } else { + Box::new(move |_: i128| 0) + } + } else { + let div = 10_i128.pow_wrapping((*scale as u32) - min(*scale as u32, *point as u32)); + let half = div / 2; + Box::new(move |x: i128| (x + x.signum() * half) / div) + } +} + +#[inline] +fn make_decimal_array( + array: &ArrayRef, + precision: u8, + scale: i8, + f: &dyn Fn(i128) -> i128, +) -> Result { + let array = array.as_primitive::(); + let result: Decimal128Array = arrow::compute::kernels::arity::unary(array, f); + let result = result.with_data_type(DataType::Decimal128(precision, scale)); + Ok(ColumnarValue::Array(Arc::new(result))) +} + +#[inline] +fn make_decimal_scalar( + a: &Option, + precision: u8, + scale: i8, + f: &dyn Fn(i128) -> i128, +) -> Result { + let result = ScalarValue::Decimal128(a.map(f), precision, scale); + Ok(ColumnarValue::Scalar(result)) +} + +macro_rules! integer_round { + ($X:expr, $DIV:expr, $HALF:expr) => {{ + let rem = $X % $DIV; + if rem <= -$HALF { + ($X - rem).sub_wrapping($DIV) + } else if rem >= $HALF { + ($X - rem).add_wrapping($DIV) + } else { + $X - rem + } + }}; +} + +macro_rules! round_integer_array { + ($ARRAY:expr, $POINT:expr, $TYPE:ty, $NATIVE:ty) => {{ + let array = $ARRAY.as_any().downcast_ref::<$TYPE>().unwrap(); + let ten: $NATIVE = 10; + let result: $TYPE = if let Some(div) = ten.checked_pow((-(*$POINT)) as u32) { + let half = div / 2; + arrow::compute::kernels::arity::unary(array, |x| integer_round!(x, div, half)) + } else { + arrow::compute::kernels::arity::unary(array, |_| 0) + }; + Ok(ColumnarValue::Array(Arc::new(result))) + }}; +} + +macro_rules! 
round_integer_scalar { + ($SCALAR:expr, $POINT:expr, $TYPE:expr, $NATIVE:ty) => {{ + let ten: $NATIVE = 10; + if let Some(div) = ten.checked_pow((-(*$POINT)) as u32) { + let half = div / 2; + Ok(ColumnarValue::Scalar($TYPE( + $SCALAR.map(|x| integer_round!(x, div, half)), + ))) + } else { + Ok(ColumnarValue::Scalar($TYPE(Some(0)))) + } + }}; +} + +/// `round` function that simulates Spark `round` expression +pub fn spark_round( + args: &[ColumnarValue], + data_type: &DataType, +) -> Result { + let value = &args[0]; + let point = &args[1]; + let ColumnarValue::Scalar(ScalarValue::Int64(Some(point))) = point else { + return internal_err!("Invalid point argument for Round(): {:#?}", point); + }; + match value { + ColumnarValue::Array(array) => match array.data_type() { + DataType::Int64 if *point < 0 => round_integer_array!(array, point, Int64Array, i64), + DataType::Int32 if *point < 0 => round_integer_array!(array, point, Int32Array, i32), + DataType::Int16 if *point < 0 => round_integer_array!(array, point, Int16Array, i16), + DataType::Int8 if *point < 0 => round_integer_array!(array, point, Int8Array, i8), + DataType::Decimal128(_, scale) if *scale > 0 => { + let f = decimal_round_f(scale, point); + let (precision, scale) = get_precision_scale(data_type); + make_decimal_array(array, precision, scale, &f) + } + DataType::Float32 | DataType::Float64 => { + Ok(ColumnarValue::Array(round(&[array.clone()])?)) + } + dt => exec_err!("Not supported datatype for ROUND: {dt}"), + }, + ColumnarValue::Scalar(a) => match a { + ScalarValue::Int64(a) if *point < 0 => { + round_integer_scalar!(a, point, ScalarValue::Int64, i64) + } + ScalarValue::Int32(a) if *point < 0 => { + round_integer_scalar!(a, point, ScalarValue::Int32, i32) + } + ScalarValue::Int16(a) if *point < 0 => { + round_integer_scalar!(a, point, ScalarValue::Int16, i16) + } + ScalarValue::Int8(a) if *point < 0 => { + round_integer_scalar!(a, point, ScalarValue::Int8, i8) + } + ScalarValue::Decimal128(a, _, scale) if *scale >= 0 => { + let f = decimal_round_f(scale, point); + let (precision, scale) = get_precision_scale(data_type); + make_decimal_scalar(a, precision, scale, &f) + } + ScalarValue::Float32(_) | ScalarValue::Float64(_) => Ok(ColumnarValue::Scalar( + ScalarValue::try_from_array(&round(&[a.to_array()?])?, 0)?, + )), + dt => exec_err!("Not supported datatype for ROUND: {dt}"), + }, + } +} + +/// Similar to DataFusion `rpad`, but not to truncate when the string is already longer than length +pub fn spark_rpad(args: &[ColumnarValue]) -> Result { + match args { + [ColumnarValue::Array(array), ColumnarValue::Scalar(ScalarValue::Int32(Some(length)))] => { + match args[0].data_type() { + DataType::Utf8 => spark_rpad_internal::(array, *length), + DataType::LargeUtf8 => spark_rpad_internal::(array, *length), + // TODO: handle Dictionary types + other => Err(DataFusionError::Internal(format!( + "Unsupported data type {other:?} for function rpad", + ))), + } + } + other => Err(DataFusionError::Internal(format!( + "Unsupported arguments {other:?} for function rpad", + ))), + } +} + +fn spark_rpad_internal( + array: &ArrayRef, + length: i32, +) -> Result { + let string_array = as_generic_string_array::(array)?; + + let result = string_array + .iter() + .map(|string| match string { + Some(string) => { + let length = if length < 0 { 0 } else { length as usize }; + if length == 0 { + Ok(Some("".to_string())) + } else { + let graphemes = string.graphemes(true).collect::>(); + if length < graphemes.len() { + Ok(Some(string.to_string())) + } 
else { + let mut s = string.to_string(); + s.push_str(" ".repeat(length - graphemes.len()).as_str()); + Ok(Some(s)) + } + } + } + _ => Ok(None), + }) + .collect::, DataFusionError>>()?; + Ok(ColumnarValue::Array(Arc::new(result))) +} + +// Let Decimal(p3, s3) as return type i.e. Decimal(p1, s1) / Decimal(p2, s2) = Decimal(p3, s3). +// Conversely, Decimal(p1, s1) = Decimal(p2, s2) * Decimal(p3, s3). This means that, in order to +// get enough scale that matches with Spark behavior, it requires to widen s1 to s2 + s3 + 1. Since +// both s2 and s3 are 38 at max., s1 is 77 at max. DataFusion division cannot handle such scale > +// Decimal256Type::MAX_SCALE. Therefore, we need to implement this decimal division using BigInt. +pub fn spark_decimal_div( + args: &[ColumnarValue], + data_type: &DataType, +) -> Result { + let left = &args[0]; + let right = &args[1]; + let (p3, s3) = get_precision_scale(data_type); + + let (left, right): (ArrayRef, ArrayRef) = match (left, right) { + (ColumnarValue::Array(l), ColumnarValue::Array(r)) => (l.clone(), r.clone()), + (ColumnarValue::Scalar(l), ColumnarValue::Array(r)) => { + (l.to_array_of_size(r.len())?, r.clone()) + } + (ColumnarValue::Array(l), ColumnarValue::Scalar(r)) => { + (l.clone(), r.to_array_of_size(l.len())?) + } + (ColumnarValue::Scalar(l), ColumnarValue::Scalar(r)) => (l.to_array()?, r.to_array()?), + }; + let left = left.as_primitive::(); + let right = right.as_primitive::(); + let (_, s1) = get_precision_scale(left.data_type()); + let (_, s2) = get_precision_scale(right.data_type()); + + let ten = BigInt::from(10); + let l_exp = ((s2 + s3 + 1) as u32).saturating_sub(s1 as u32); + let r_exp = (s1 as u32).saturating_sub((s2 + s3 + 1) as u32); + let l_mul = ten.pow(l_exp); + let r_mul = ten.pow(r_exp); + let five = BigInt::from(5); + let zero = BigInt::from(0); + let result: Decimal128Array = arrow::compute::kernels::arity::binary(left, right, |l, r| { + let l = BigInt::from(l) * &l_mul; + let r = BigInt::from(r) * &r_mul; + let div = if r.eq(&zero) { zero.clone() } else { &l / &r }; + let res = if div.is_negative() { + div - &five + } else { + div + &five + } / &ten; + res.to_i128().unwrap_or(i128::MAX) + })?; + let result = result.with_data_type(DataType::Decimal128(p3, s3)); + Ok(ColumnarValue::Array(Arc::new(result))) +} + +/// Spark-compatible `isnan` expression +pub fn spark_isnan(args: &[ColumnarValue]) -> Result { + fn set_nulls_to_false(is_nan: BooleanArray) -> ColumnarValue { + match is_nan.nulls() { + Some(nulls) => { + let is_not_null = nulls.inner(); + ColumnarValue::Array(Arc::new(BooleanArray::new( + is_nan.values() & is_not_null, + None, + ))) + } + None => ColumnarValue::Array(Arc::new(is_nan)), + } + } + let value = &args[0]; + match value { + ColumnarValue::Array(array) => match array.data_type() { + DataType::Float64 => { + let array = array.as_any().downcast_ref::().unwrap(); + let is_nan = BooleanArray::from_unary(array, |x| x.is_nan()); + Ok(set_nulls_to_false(is_nan)) + } + DataType::Float32 => { + let array = array.as_any().downcast_ref::().unwrap(); + let is_nan = BooleanArray::from_unary(array, |x| x.is_nan()); + Ok(set_nulls_to_false(is_nan)) + } + other => Err(DataFusionError::Internal(format!( + "Unsupported data type {:?} for function isnan", + other, + ))), + }, + ColumnarValue::Scalar(a) => match a { + ScalarValue::Float64(a) => Ok(ColumnarValue::Scalar(ScalarValue::Boolean(Some( + a.map(|x| x.is_nan()).unwrap_or(false), + )))), + ScalarValue::Float32(a) => Ok(ColumnarValue::Scalar(ScalarValue::Boolean(Some( + 
a.map(|x| x.is_nan()).unwrap_or(false), + )))), + _ => Err(DataFusionError::Internal(format!( + "Unsupported data type {:?} for function isnan", + value.data_type(), + ))), + }, + } +} diff --git a/src/scalar_funcs/chr.rs b/src/scalar_funcs/chr.rs new file mode 100644 index 000000000000..5de59f9f27ca --- /dev/null +++ b/src/scalar_funcs/chr.rs @@ -0,0 +1,125 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::{any::Any, sync::Arc}; + +use arrow::{ + array::{ArrayRef, StringArray}, + datatypes::{ + DataType, + DataType::{Int64, Utf8}, + }, +}; + +use datafusion::logical_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; +use datafusion_common::{cast::as_int64_array, exec_err, Result, ScalarValue}; + +fn chr(args: &[ArrayRef]) -> Result { + let integer_array = as_int64_array(&args[0])?; + + // first map is the iterator, second is for the `Option<_>` + let result = integer_array + .iter() + .map(|integer: Option| { + integer + .map(|integer| { + if integer < 0 { + return Ok("".to_string()); // Return empty string for negative integers + } + match core::char::from_u32((integer % 256) as u32) { + Some(ch) => Ok(ch.to_string()), + None => { + exec_err!("requested character not compatible for encoding.") + } + } + }) + .transpose() + }) + .collect::>()?; + + Ok(Arc::new(result) as ArrayRef) +} + +/// Spark-compatible `chr` expression +#[derive(Debug)] +pub struct SparkChrFunc { + signature: Signature, +} + +impl Default for SparkChrFunc { + fn default() -> Self { + Self::new() + } +} + +impl SparkChrFunc { + pub fn new() -> Self { + Self { + signature: Signature::uniform(1, vec![Int64], Volatility::Immutable), + } + } +} + +impl ScalarUDFImpl for SparkChrFunc { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "chr" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(Utf8) + } + + fn invoke(&self, args: &[ColumnarValue]) -> Result { + spark_chr(args) + } +} + +/// Returns the ASCII character having the binary equivalent to the input expression. +/// E.g., chr(65) = 'A'. 
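+/// Inputs are taken modulo 256, and negative inputs produce an empty string.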
+/// Compatible with Apache Spark's Chr function +fn spark_chr(args: &[ColumnarValue]) -> Result { + let array = args[0].clone(); + match array { + ColumnarValue::Array(array) => { + let array = chr(&[array])?; + Ok(ColumnarValue::Array(array)) + } + ColumnarValue::Scalar(ScalarValue::Int64(Some(value))) => { + if value < 0 { + Ok(ColumnarValue::Scalar(ScalarValue::Utf8(Some( + "".to_string(), + )))) + } else { + match core::char::from_u32((value % 256) as u32) { + Some(ch) => Ok(ColumnarValue::Scalar(ScalarValue::Utf8(Some( + ch.to_string(), + )))), + None => exec_err!("requested character was incompatible for encoding."), + } + } + } + _ => exec_err!("The argument must be an Int64 array or scalar."), + } +} diff --git a/src/scalar_funcs/hash_expressions.rs b/src/scalar_funcs/hash_expressions.rs new file mode 100644 index 000000000000..1a403b9e3db1 --- /dev/null +++ b/src/scalar_funcs/hash_expressions.rs @@ -0,0 +1,162 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::scalar_funcs::hex::hex_strings; +use crate::spark_hash::{create_murmur3_hashes, create_xxhash64_hashes}; + +use arrow_array::{ArrayRef, Int32Array, Int64Array, StringArray}; +use datafusion::functions::crypto::{sha224, sha256, sha384, sha512}; +use datafusion_common::cast::as_binary_array; +use datafusion_common::{exec_err, internal_err, DataFusionError, ScalarValue}; +use datafusion_expr::{ColumnarValue, ScalarFunctionImplementation}; +use std::sync::Arc; + +/// Spark compatible murmur3 hash (just `hash` in Spark) in vectorized execution fashion +pub fn spark_murmur3_hash(args: &[ColumnarValue]) -> Result { + let length = args.len(); + let seed = &args[length - 1]; + match seed { + ColumnarValue::Scalar(ScalarValue::Int32(Some(seed))) => { + // iterate over the arguments to find out the length of the array + let num_rows = args[0..args.len() - 1] + .iter() + .find_map(|arg| match arg { + ColumnarValue::Array(array) => Some(array.len()), + ColumnarValue::Scalar(_) => None, + }) + .unwrap_or(1); + let mut hashes: Vec = vec![0_u32; num_rows]; + hashes.fill(*seed as u32); + let arrays = args[0..args.len() - 1] + .iter() + .map(|arg| match arg { + ColumnarValue::Array(array) => array.clone(), + ColumnarValue::Scalar(scalar) => { + scalar.clone().to_array_of_size(num_rows).unwrap() + } + }) + .collect::>(); + create_murmur3_hashes(&arrays, &mut hashes)?; + if num_rows == 1 { + Ok(ColumnarValue::Scalar(ScalarValue::Int32(Some( + hashes[0] as i32, + )))) + } else { + let hashes: Vec = hashes.into_iter().map(|x| x as i32).collect(); + Ok(ColumnarValue::Array(Arc::new(Int32Array::from(hashes)))) + } + } + _ => { + internal_err!( + "The seed of function murmur3_hash must be an Int32 scalar value, but got: {:?}.", + seed + ) + } + } +} + +/// Spark compatible xxhash64 in 
vectorized execution fashion +pub fn spark_xxhash64(args: &[ColumnarValue]) -> Result { + let length = args.len(); + let seed = &args[length - 1]; + match seed { + ColumnarValue::Scalar(ScalarValue::Int64(Some(seed))) => { + // iterate over the arguments to find out the length of the array + let num_rows = args[0..args.len() - 1] + .iter() + .find_map(|arg| match arg { + ColumnarValue::Array(array) => Some(array.len()), + ColumnarValue::Scalar(_) => None, + }) + .unwrap_or(1); + let mut hashes: Vec = vec![0_u64; num_rows]; + hashes.fill(*seed as u64); + let arrays = args[0..args.len() - 1] + .iter() + .map(|arg| match arg { + ColumnarValue::Array(array) => array.clone(), + ColumnarValue::Scalar(scalar) => { + scalar.clone().to_array_of_size(num_rows).unwrap() + } + }) + .collect::>(); + create_xxhash64_hashes(&arrays, &mut hashes)?; + if num_rows == 1 { + Ok(ColumnarValue::Scalar(ScalarValue::Int64(Some( + hashes[0] as i64, + )))) + } else { + let hashes: Vec = hashes.into_iter().map(|x| x as i64).collect(); + Ok(ColumnarValue::Array(Arc::new(Int64Array::from(hashes)))) + } + } + _ => { + internal_err!( + "The seed of function xxhash64 must be an Int64 scalar value, but got: {:?}.", + seed + ) + } + } +} + +/// `sha224` function that simulates Spark's `sha2` expression with bit width 224 +pub fn spark_sha224(args: &[ColumnarValue]) -> Result { + wrap_digest_result_as_hex_string(args, sha224().fun()) +} + +/// `sha256` function that simulates Spark's `sha2` expression with bit width 0 or 256 +pub fn spark_sha256(args: &[ColumnarValue]) -> Result { + wrap_digest_result_as_hex_string(args, sha256().fun()) +} + +/// `sha384` function that simulates Spark's `sha2` expression with bit width 384 +pub fn spark_sha384(args: &[ColumnarValue]) -> Result { + wrap_digest_result_as_hex_string(args, sha384().fun()) +} + +/// `sha512` function that simulates Spark's `sha2` expression with bit width 512 +pub fn spark_sha512(args: &[ColumnarValue]) -> Result { + wrap_digest_result_as_hex_string(args, sha512().fun()) +} + +// Spark requires hex string as the result of sha2 functions, we have to wrap the +// result of digest functions as hex string +fn wrap_digest_result_as_hex_string( + args: &[ColumnarValue], + digest: ScalarFunctionImplementation, +) -> Result { + let value = digest(args)?; + match value { + ColumnarValue::Array(array) => { + let binary_array = as_binary_array(&array)?; + let string_array: StringArray = binary_array + .iter() + .map(|opt| opt.map(hex_strings::<_>)) + .collect(); + Ok(ColumnarValue::Array(Arc::new(string_array))) + } + ColumnarValue::Scalar(ScalarValue::Binary(opt)) => Ok(ColumnarValue::Scalar( + ScalarValue::Utf8(opt.map(hex_strings::<_>)), + )), + _ => { + exec_err!( + "digest function should return binary value, but got: {:?}", + value.data_type() + ) + } + } +} diff --git a/src/scalar_funcs/hex.rs b/src/scalar_funcs/hex.rs new file mode 100644 index 000000000000..e572ba5ef39a --- /dev/null +++ b/src/scalar_funcs/hex.rs @@ -0,0 +1,296 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::sync::Arc; + +use arrow::{ + array::{as_dictionary_array, as_largestring_array, as_string_array}, + datatypes::Int32Type, +}; +use arrow_array::StringArray; +use arrow_schema::DataType; +use datafusion::logical_expr::ColumnarValue; +use datafusion_common::{ + cast::{as_binary_array, as_fixed_size_binary_array, as_int64_array}, + exec_err, DataFusionError, +}; +use std::fmt::Write; + +fn hex_int64(num: i64) -> String { + format!("{:X}", num) +} + +#[inline(always)] +fn hex_encode>(data: T, lower_case: bool) -> String { + let mut s = String::with_capacity(data.as_ref().len() * 2); + if lower_case { + for b in data.as_ref() { + // Writing to a string never errors, so we can unwrap here. + write!(&mut s, "{b:02x}").unwrap(); + } + } else { + for b in data.as_ref() { + // Writing to a string never errors, so we can unwrap here. + write!(&mut s, "{b:02X}").unwrap(); + } + } + s +} + +#[inline(always)] +pub(super) fn hex_strings>(data: T) -> String { + hex_encode(data, true) +} + +#[inline(always)] +fn hex_bytes>(bytes: T) -> Result { + let hex_string = hex_encode(bytes, false); + Ok(hex_string) +} + +/// Spark-compatible `hex` function +pub fn spark_hex(args: &[ColumnarValue]) -> Result { + if args.len() != 1 { + return Err(DataFusionError::Internal( + "hex expects exactly one argument".to_string(), + )); + } + + match &args[0] { + ColumnarValue::Array(array) => match array.data_type() { + DataType::Int64 => { + let array = as_int64_array(array)?; + + let hexed_array: StringArray = array.iter().map(|v| v.map(hex_int64)).collect(); + + Ok(ColumnarValue::Array(Arc::new(hexed_array))) + } + DataType::Utf8 => { + let array = as_string_array(array); + + let hexed: StringArray = array + .iter() + .map(|v| v.map(hex_bytes).transpose()) + .collect::>()?; + + Ok(ColumnarValue::Array(Arc::new(hexed))) + } + DataType::LargeUtf8 => { + let array = as_largestring_array(array); + + let hexed: StringArray = array + .iter() + .map(|v| v.map(hex_bytes).transpose()) + .collect::>()?; + + Ok(ColumnarValue::Array(Arc::new(hexed))) + } + DataType::Binary => { + let array = as_binary_array(array)?; + + let hexed: StringArray = array + .iter() + .map(|v| v.map(hex_bytes).transpose()) + .collect::>()?; + + Ok(ColumnarValue::Array(Arc::new(hexed))) + } + DataType::FixedSizeBinary(_) => { + let array = as_fixed_size_binary_array(array)?; + + let hexed: StringArray = array + .iter() + .map(|v| v.map(hex_bytes).transpose()) + .collect::>()?; + + Ok(ColumnarValue::Array(Arc::new(hexed))) + } + DataType::Dictionary(_, value_type) => { + let dict = as_dictionary_array::(&array); + + let values = match **value_type { + DataType::Int64 => as_int64_array(dict.values())? + .iter() + .map(|v| v.map(hex_int64)) + .collect::>(), + DataType::Utf8 => as_string_array(dict.values()) + .iter() + .map(|v| v.map(hex_bytes).transpose()) + .collect::>()?, + DataType::Binary => as_binary_array(dict.values())? 
+ .iter() + .map(|v| v.map(hex_bytes).transpose()) + .collect::>()?, + _ => exec_err!( + "hex got an unexpected argument type: {:?}", + array.data_type() + )?, + }; + + let new_values: Vec> = dict + .keys() + .iter() + .map(|key| key.map(|k| values[k as usize].clone()).unwrap_or(None)) + .collect(); + + let string_array_values = StringArray::from(new_values); + + Ok(ColumnarValue::Array(Arc::new(string_array_values))) + } + _ => exec_err!( + "hex got an unexpected argument type: {:?}", + array.data_type() + ), + }, + _ => exec_err!("native hex does not support scalar values at this time"), + } +} + +#[cfg(test)] +mod test { + use std::sync::Arc; + + use arrow::{ + array::{ + as_string_array, BinaryDictionaryBuilder, PrimitiveDictionaryBuilder, StringBuilder, + StringDictionaryBuilder, + }, + datatypes::{Int32Type, Int64Type}, + }; + use arrow_array::{Int64Array, StringArray}; + use datafusion::logical_expr::ColumnarValue; + + #[test] + fn test_dictionary_hex_utf8() { + let mut input_builder = StringDictionaryBuilder::::new(); + input_builder.append_value("hi"); + input_builder.append_value("bye"); + input_builder.append_null(); + input_builder.append_value("rust"); + let input = input_builder.finish(); + + let mut string_builder = StringBuilder::new(); + string_builder.append_value("6869"); + string_builder.append_value("627965"); + string_builder.append_null(); + string_builder.append_value("72757374"); + let expected = string_builder.finish(); + + let columnar_value = ColumnarValue::Array(Arc::new(input)); + let result = super::spark_hex(&[columnar_value]).unwrap(); + + let result = match result { + ColumnarValue::Array(array) => array, + _ => panic!("Expected array"), + }; + + let result = as_string_array(&result); + + assert_eq!(result, &expected); + } + + #[test] + fn test_dictionary_hex_int64() { + let mut input_builder = PrimitiveDictionaryBuilder::::new(); + input_builder.append_value(1); + input_builder.append_value(2); + input_builder.append_null(); + input_builder.append_value(3); + let input = input_builder.finish(); + + let mut string_builder = StringBuilder::new(); + string_builder.append_value("1"); + string_builder.append_value("2"); + string_builder.append_null(); + string_builder.append_value("3"); + let expected = string_builder.finish(); + + let columnar_value = ColumnarValue::Array(Arc::new(input)); + let result = super::spark_hex(&[columnar_value]).unwrap(); + + let result = match result { + ColumnarValue::Array(array) => array, + _ => panic!("Expected array"), + }; + + let result = as_string_array(&result); + + assert_eq!(result, &expected); + } + + #[test] + fn test_dictionary_hex_binary() { + let mut input_builder = BinaryDictionaryBuilder::::new(); + input_builder.append_value("1"); + input_builder.append_value("j"); + input_builder.append_null(); + input_builder.append_value("3"); + let input = input_builder.finish(); + + let mut expected_builder = StringBuilder::new(); + expected_builder.append_value("31"); + expected_builder.append_value("6A"); + expected_builder.append_null(); + expected_builder.append_value("33"); + let expected = expected_builder.finish(); + + let columnar_value = ColumnarValue::Array(Arc::new(input)); + let result = super::spark_hex(&[columnar_value]).unwrap(); + + let result = match result { + ColumnarValue::Array(array) => array, + _ => panic!("Expected array"), + }; + + let result = as_string_array(&result); + + assert_eq!(result, &expected); + } + + #[test] + fn test_hex_int64() { + let num = 1234; + let hexed = super::hex_int64(num); + 
assert_eq!(hexed, "4D2".to_string()); + + let num = -1; + let hexed = super::hex_int64(num); + assert_eq!(hexed, "FFFFFFFFFFFFFFFF".to_string()); + } + + #[test] + fn test_spark_hex_int64() { + let int_array = Int64Array::from(vec![Some(1), Some(2), None, Some(3)]); + let columnar_value = ColumnarValue::Array(Arc::new(int_array)); + + let result = super::spark_hex(&[columnar_value]).unwrap(); + let result = match result { + ColumnarValue::Array(array) => array, + _ => panic!("Expected array"), + }; + + let string_array = as_string_array(&result); + let expected_array = StringArray::from(vec![ + Some("1".to_string()), + Some("2".to_string()), + None, + Some("3".to_string()), + ]); + + assert_eq!(string_array, &expected_array); + } +} diff --git a/src/scalar_funcs/unhex.rs b/src/scalar_funcs/unhex.rs new file mode 100644 index 000000000000..9996392b63a4 --- /dev/null +++ b/src/scalar_funcs/unhex.rs @@ -0,0 +1,258 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::sync::Arc; + +use arrow_array::OffsetSizeTrait; +use arrow_schema::DataType; +use datafusion::logical_expr::ColumnarValue; +use datafusion_common::{cast::as_generic_string_array, exec_err, DataFusionError, ScalarValue}; + +/// Helper function to convert a hex digit to a binary value. +fn unhex_digit(c: u8) -> Result { + match c { + b'0'..=b'9' => Ok(c - b'0'), + b'A'..=b'F' => Ok(10 + c - b'A'), + b'a'..=b'f' => Ok(10 + c - b'a'), + _ => Err(DataFusionError::Execution( + "Input to unhex_digit is not a valid hex digit".to_string(), + )), + } +} + +/// Convert a hex string to binary and store the result in `result`. Returns an error if the input +/// is not a valid hex string. 
+fn unhex(hex_str: &str, result: &mut Vec) -> Result<(), DataFusionError> { + let bytes = hex_str.as_bytes(); + + let mut i = 0; + + if (bytes.len() & 0x01) != 0 { + let v = unhex_digit(bytes[0])?; + + result.push(v); + i += 1; + } + + while i < bytes.len() { + let first = unhex_digit(bytes[i])?; + let second = unhex_digit(bytes[i + 1])?; + result.push((first << 4) | second); + + i += 2; + } + + Ok(()) +} + +fn spark_unhex_inner( + array: &ColumnarValue, + fail_on_error: bool, +) -> Result { + match array { + ColumnarValue::Array(array) => { + let string_array = as_generic_string_array::(array)?; + + let mut encoded = Vec::new(); + let mut builder = arrow::array::BinaryBuilder::new(); + + for item in string_array.iter() { + if let Some(s) = item { + if unhex(s, &mut encoded).is_ok() { + builder.append_value(encoded.as_slice()); + } else if fail_on_error { + return exec_err!("Input to unhex is not a valid hex string: {s}"); + } else { + builder.append_null(); + } + encoded.clear(); + } else { + builder.append_null(); + } + } + Ok(ColumnarValue::Array(Arc::new(builder.finish()))) + } + ColumnarValue::Scalar(ScalarValue::Utf8(Some(string))) => { + let mut encoded = Vec::new(); + + if unhex(string, &mut encoded).is_ok() { + Ok(ColumnarValue::Scalar(ScalarValue::Binary(Some(encoded)))) + } else if fail_on_error { + exec_err!("Input to unhex is not a valid hex string: {string}") + } else { + Ok(ColumnarValue::Scalar(ScalarValue::Binary(None))) + } + } + ColumnarValue::Scalar(ScalarValue::Utf8(None)) => { + Ok(ColumnarValue::Scalar(ScalarValue::Binary(None))) + } + _ => { + exec_err!( + "The first argument must be a string scalar or array, but got: {:?}", + array + ) + } + } +} + +/// Spark-compatible `unhex` expression +pub fn spark_unhex(args: &[ColumnarValue]) -> Result { + if args.len() > 2 { + return exec_err!("unhex takes at most 2 arguments, but got: {}", args.len()); + } + + let val_to_unhex = &args[0]; + let fail_on_error = if args.len() == 2 { + match &args[1] { + ColumnarValue::Scalar(ScalarValue::Boolean(Some(fail_on_error))) => *fail_on_error, + _ => { + return exec_err!( + "The second argument must be boolean scalar, but got: {:?}", + args[1] + ); + } + } + } else { + false + }; + + match val_to_unhex.data_type() { + DataType::Utf8 => spark_unhex_inner::(val_to_unhex, fail_on_error), + DataType::LargeUtf8 => spark_unhex_inner::(val_to_unhex, fail_on_error), + other => exec_err!( + "The first argument must be a Utf8 or LargeUtf8: {:?}", + other + ), + } +} + +#[cfg(test)] +mod test { + use std::sync::Arc; + + use arrow::array::{BinaryBuilder, StringBuilder}; + use arrow_array::make_array; + use arrow_data::ArrayData; + use datafusion::logical_expr::ColumnarValue; + use datafusion_common::ScalarValue; + + use super::unhex; + + #[test] + fn test_spark_unhex_null() -> Result<(), Box> { + let input = ArrayData::new_null(&arrow_schema::DataType::Utf8, 2); + let output = ArrayData::new_null(&arrow_schema::DataType::Binary, 2); + + let input = ColumnarValue::Array(Arc::new(make_array(input))); + let expected = ColumnarValue::Array(Arc::new(make_array(output))); + + let result = super::spark_unhex(&[input])?; + + match (result, expected) { + (ColumnarValue::Array(result), ColumnarValue::Array(expected)) => { + assert_eq!(*result, *expected); + Ok(()) + } + _ => Err("Unexpected result type".into()), + } + } + + #[test] + fn test_partial_error() -> Result<(), Box> { + let mut input = StringBuilder::new(); + + input.append_value("1CGG"); // 1C is ok, but GG is invalid + 
input.append_value("537061726B2053514C"); // followed by valid + + let input = ColumnarValue::Array(Arc::new(input.finish())); + let fail_on_error = ColumnarValue::Scalar(ScalarValue::Boolean(Some(false))); + + let result = super::spark_unhex(&[input, fail_on_error])?; + + let mut expected = BinaryBuilder::new(); + expected.append_null(); + expected.append_value("Spark SQL".as_bytes()); + + match (result, ColumnarValue::Array(Arc::new(expected.finish()))) { + (ColumnarValue::Array(result), ColumnarValue::Array(expected)) => { + assert_eq!(*result, *expected); + + Ok(()) + } + _ => Err("Unexpected result type".into()), + } + } + + #[test] + fn test_unhex_valid() -> Result<(), Box> { + let mut result = Vec::new(); + + unhex("537061726B2053514C", &mut result)?; + let result_str = std::str::from_utf8(&result)?; + assert_eq!(result_str, "Spark SQL"); + result.clear(); + + unhex("1C", &mut result)?; + assert_eq!(result, vec![28]); + result.clear(); + + unhex("737472696E67", &mut result)?; + assert_eq!(result, "string".as_bytes()); + result.clear(); + + unhex("1", &mut result)?; + assert_eq!(result, vec![1]); + result.clear(); + + Ok(()) + } + + #[test] + fn test_odd_length() -> Result<(), Box> { + let mut result = Vec::new(); + + unhex("A1B", &mut result)?; + assert_eq!(result, vec![10, 27]); + result.clear(); + + unhex("0A1B", &mut result)?; + assert_eq!(result, vec![10, 27]); + result.clear(); + + Ok(()) + } + + #[test] + fn test_unhex_empty() { + let mut result = Vec::new(); + + // Empty hex string + unhex("", &mut result).unwrap(); + assert!(result.is_empty()); + } + + #[test] + fn test_unhex_invalid() { + let mut result = Vec::new(); + + // Invalid hex strings + assert!(unhex("##", &mut result).is_err()); + assert!(unhex("G123", &mut result).is_err()); + assert!(unhex("hello", &mut result).is_err()); + assert!(unhex("\0", &mut result).is_err()); + } +} diff --git a/src/spark_hash.rs b/src/spark_hash.rs new file mode 100644 index 000000000000..66a103a2ae27 --- /dev/null +++ b/src/spark_hash.rs @@ -0,0 +1,708 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! This includes utilities for hashing and murmur3 hashing. 
+ +use arrow::{ + compute::take, + datatypes::{ArrowNativeTypeOp, UInt16Type, UInt32Type, UInt64Type, UInt8Type}, +}; +use std::sync::Arc; + +use datafusion::{ + arrow::{ + array::*, + datatypes::{ + ArrowDictionaryKeyType, ArrowNativeType, DataType, Int16Type, Int32Type, Int64Type, + Int8Type, TimeUnit, + }, + }, + error::{DataFusionError, Result}, +}; + +use crate::xxhash64::spark_compatible_xxhash64; + +/// Spark-compatible murmur3 hash function +#[inline] +pub fn spark_compatible_murmur3_hash>(data: T, seed: u32) -> u32 { + #[inline] + fn mix_k1(mut k1: i32) -> i32 { + k1 = k1.mul_wrapping(0xcc9e2d51u32 as i32); + k1 = k1.rotate_left(15); + k1 = k1.mul_wrapping(0x1b873593u32 as i32); + k1 + } + + #[inline] + fn mix_h1(mut h1: i32, k1: i32) -> i32 { + h1 ^= k1; + h1 = h1.rotate_left(13); + h1 = h1.mul_wrapping(5).add_wrapping(0xe6546b64u32 as i32); + h1 + } + + #[inline] + fn fmix(mut h1: i32, len: i32) -> i32 { + h1 ^= len; + h1 ^= (h1 as u32 >> 16) as i32; + h1 = h1.mul_wrapping(0x85ebca6bu32 as i32); + h1 ^= (h1 as u32 >> 13) as i32; + h1 = h1.mul_wrapping(0xc2b2ae35u32 as i32); + h1 ^= (h1 as u32 >> 16) as i32; + h1 + } + + #[inline] + unsafe fn hash_bytes_by_int(data: &[u8], seed: u32) -> i32 { + // safety: data length must be aligned to 4 bytes + let mut h1 = seed as i32; + for i in (0..data.len()).step_by(4) { + let ints = data.as_ptr().add(i) as *const i32; + let mut half_word = ints.read_unaligned(); + if cfg!(target_endian = "big") { + half_word = half_word.reverse_bits(); + } + h1 = mix_h1(h1, mix_k1(half_word)); + } + h1 + } + let data = data.as_ref(); + let len = data.len(); + let len_aligned = len - len % 4; + + // safety: + // avoid boundary checking in performance critical codes. + // all operations are guaranteed to be safe + // data is &[u8] so we do not need to check for proper alignment + unsafe { + let mut h1 = if len_aligned > 0 { + hash_bytes_by_int(&data[0..len_aligned], seed) + } else { + seed as i32 + }; + + for i in len_aligned..len { + let half_word = *data.get_unchecked(i) as i8 as i32; + h1 = mix_h1(h1, mix_k1(half_word)); + } + fmix(h1, len as i32) as u32 + } +} + +macro_rules! hash_array { + ($array_type: ident, $column: ident, $hashes: ident, $hash_method: ident) => { + let array = $column.as_any().downcast_ref::<$array_type>().unwrap(); + if array.null_count() == 0 { + for (i, hash) in $hashes.iter_mut().enumerate() { + *hash = $hash_method(&array.value(i), *hash); + } + } else { + for (i, hash) in $hashes.iter_mut().enumerate() { + if !array.is_null(i) { + *hash = $hash_method(&array.value(i), *hash); + } + } + } + }; +} + +macro_rules! hash_array_boolean { + ($array_type: ident, $column: ident, $hash_input_type: ident, $hashes: ident, $hash_method: ident) => { + let array = $column.as_any().downcast_ref::<$array_type>().unwrap(); + if array.null_count() == 0 { + for (i, hash) in $hashes.iter_mut().enumerate() { + *hash = $hash_method($hash_input_type::from(array.value(i)).to_le_bytes(), *hash); + } + } else { + for (i, hash) in $hashes.iter_mut().enumerate() { + if !array.is_null(i) { + *hash = + $hash_method($hash_input_type::from(array.value(i)).to_le_bytes(), *hash); + } + } + } + }; +} + +macro_rules! 
hash_array_primitive { + ($array_type: ident, $column: ident, $ty: ident, $hashes: ident, $hash_method: ident) => { + let array = $column.as_any().downcast_ref::<$array_type>().unwrap(); + let values = array.values(); + + if array.null_count() == 0 { + for (hash, value) in $hashes.iter_mut().zip(values.iter()) { + *hash = $hash_method((*value as $ty).to_le_bytes(), *hash); + } + } else { + for (i, (hash, value)) in $hashes.iter_mut().zip(values.iter()).enumerate() { + if !array.is_null(i) { + *hash = $hash_method((*value as $ty).to_le_bytes(), *hash); + } + } + } + }; +} + +macro_rules! hash_array_primitive_float { + ($array_type: ident, $column: ident, $ty: ident, $ty2: ident, $hashes: ident, $hash_method: ident) => { + let array = $column.as_any().downcast_ref::<$array_type>().unwrap(); + let values = array.values(); + + if array.null_count() == 0 { + for (hash, value) in $hashes.iter_mut().zip(values.iter()) { + // Spark uses 0 as hash for -0.0, see `Murmur3Hash` expression. + if *value == 0.0 && value.is_sign_negative() { + *hash = $hash_method((0 as $ty2).to_le_bytes(), *hash); + } else { + *hash = $hash_method((*value as $ty).to_le_bytes(), *hash); + } + } + } else { + for (i, (hash, value)) in $hashes.iter_mut().zip(values.iter()).enumerate() { + if !array.is_null(i) { + // Spark uses 0 as hash for -0.0, see `Murmur3Hash` expression. + if *value == 0.0 && value.is_sign_negative() { + *hash = $hash_method((0 as $ty2).to_le_bytes(), *hash); + } else { + *hash = $hash_method((*value as $ty).to_le_bytes(), *hash); + } + } + } + } + }; +} + +macro_rules! hash_array_decimal { + ($array_type:ident, $column: ident, $hashes: ident, $hash_method: ident) => { + let array = $column.as_any().downcast_ref::<$array_type>().unwrap(); + + if array.null_count() == 0 { + for (i, hash) in $hashes.iter_mut().enumerate() { + *hash = $hash_method(array.value(i).to_le_bytes(), *hash); + } + } else { + for (i, hash) in $hashes.iter_mut().enumerate() { + if !array.is_null(i) { + *hash = $hash_method(array.value(i).to_le_bytes(), *hash); + } + } + } + }; +} + +/// Hash the values in a dictionary array +fn create_hashes_dictionary( + array: &ArrayRef, + hashes_buffer: &mut [u32], + first_col: bool, +) -> Result<()> { + let dict_array = array.as_any().downcast_ref::>().unwrap(); + if !first_col { + // unpack the dictionary array as each row may have a different hash input + let unpacked = take(dict_array.values().as_ref(), dict_array.keys(), None)?; + create_murmur3_hashes(&[unpacked], hashes_buffer)?; + } else { + // For the first column, hash each dictionary value once, and then use + // that computed hash for each key value to avoid a potentially + // expensive redundant hashing for large dictionary elements (e.g. 
strings) + let dict_values = Arc::clone(dict_array.values()); + // same initial seed as Spark + let mut dict_hashes = vec![42; dict_values.len()]; + create_murmur3_hashes(&[dict_values], &mut dict_hashes)?; + for (hash, key) in hashes_buffer.iter_mut().zip(dict_array.keys().iter()) { + if let Some(key) = key { + let idx = key.to_usize().ok_or_else(|| { + DataFusionError::Internal(format!( + "Can not convert key value {:?} to usize in dictionary of type {:?}", + key, + dict_array.data_type() + )) + })?; + *hash = dict_hashes[idx] + } // no update for Null, consistent with other hashes + } + } + Ok(()) +} + +// Hash the values in a dictionary array using xxhash64 +fn create_xxhash64_hashes_dictionary( + array: &ArrayRef, + hashes_buffer: &mut [u64], + first_col: bool, +) -> Result<()> { + let dict_array = array.as_any().downcast_ref::>().unwrap(); + if !first_col { + let unpacked = take(dict_array.values().as_ref(), dict_array.keys(), None)?; + create_xxhash64_hashes(&[unpacked], hashes_buffer)?; + } else { + // Hash each dictionary value once, and then use that computed + // hash for each key value to avoid a potentially expensive + // redundant hashing for large dictionary elements (e.g. strings) + let dict_values = Arc::clone(dict_array.values()); + // same initial seed as Spark + let mut dict_hashes = vec![42u64; dict_values.len()]; + create_xxhash64_hashes(&[dict_values], &mut dict_hashes)?; + + for (hash, key) in hashes_buffer.iter_mut().zip(dict_array.keys().iter()) { + if let Some(key) = key { + let idx = key.to_usize().ok_or_else(|| { + DataFusionError::Internal(format!( + "Can not convert key value {:?} to usize in dictionary of type {:?}", + key, + dict_array.data_type() + )) + })?; + *hash = dict_hashes[idx] + } // no update for Null, consistent with other hashes + } + } + Ok(()) +} + +/// Creates hash values for every row, based on the values in the +/// columns. +/// +/// The number of rows to hash is determined by `hashes_buffer.len()`. +/// `hashes_buffer` should be pre-sized appropriately +/// +/// `hash_method` is the hash function to use. +/// `create_dictionary_hash_method` is the function to create hashes for dictionary arrays input. +macro_rules! 
create_hashes_internal { + ($arrays: ident, $hashes_buffer: ident, $hash_method: ident, $create_dictionary_hash_method: ident) => { + for (i, col) in $arrays.iter().enumerate() { + let first_col = i == 0; + match col.data_type() { + DataType::Boolean => { + hash_array_boolean!(BooleanArray, col, i32, $hashes_buffer, $hash_method); + } + DataType::Int8 => { + hash_array_primitive!(Int8Array, col, i32, $hashes_buffer, $hash_method); + } + DataType::Int16 => { + hash_array_primitive!(Int16Array, col, i32, $hashes_buffer, $hash_method); + } + DataType::Int32 => { + hash_array_primitive!(Int32Array, col, i32, $hashes_buffer, $hash_method); + } + DataType::Int64 => { + hash_array_primitive!(Int64Array, col, i64, $hashes_buffer, $hash_method); + } + DataType::Float32 => { + hash_array_primitive_float!( + Float32Array, + col, + f32, + i32, + $hashes_buffer, + $hash_method + ); + } + DataType::Float64 => { + hash_array_primitive_float!( + Float64Array, + col, + f64, + i64, + $hashes_buffer, + $hash_method + ); + } + DataType::Timestamp(TimeUnit::Second, _) => { + hash_array_primitive!( + TimestampSecondArray, + col, + i64, + $hashes_buffer, + $hash_method + ); + } + DataType::Timestamp(TimeUnit::Millisecond, _) => { + hash_array_primitive!( + TimestampMillisecondArray, + col, + i64, + $hashes_buffer, + $hash_method + ); + } + DataType::Timestamp(TimeUnit::Microsecond, _) => { + hash_array_primitive!( + TimestampMicrosecondArray, + col, + i64, + $hashes_buffer, + $hash_method + ); + } + DataType::Timestamp(TimeUnit::Nanosecond, _) => { + hash_array_primitive!( + TimestampNanosecondArray, + col, + i64, + $hashes_buffer, + $hash_method + ); + } + DataType::Date32 => { + hash_array_primitive!(Date32Array, col, i32, $hashes_buffer, $hash_method); + } + DataType::Date64 => { + hash_array_primitive!(Date64Array, col, i64, $hashes_buffer, $hash_method); + } + DataType::Utf8 => { + hash_array!(StringArray, col, $hashes_buffer, $hash_method); + } + DataType::LargeUtf8 => { + hash_array!(LargeStringArray, col, $hashes_buffer, $hash_method); + } + DataType::Binary => { + hash_array!(BinaryArray, col, $hashes_buffer, $hash_method); + } + DataType::LargeBinary => { + hash_array!(LargeBinaryArray, col, $hashes_buffer, $hash_method); + } + DataType::FixedSizeBinary(_) => { + hash_array!(FixedSizeBinaryArray, col, $hashes_buffer, $hash_method); + } + DataType::Decimal128(_, _) => { + hash_array_decimal!(Decimal128Array, col, $hashes_buffer, $hash_method); + } + DataType::Dictionary(index_type, _) => match **index_type { + DataType::Int8 => { + $create_dictionary_hash_method::(col, $hashes_buffer, first_col)?; + } + DataType::Int16 => { + $create_dictionary_hash_method::( + col, + $hashes_buffer, + first_col, + )?; + } + DataType::Int32 => { + $create_dictionary_hash_method::( + col, + $hashes_buffer, + first_col, + )?; + } + DataType::Int64 => { + $create_dictionary_hash_method::( + col, + $hashes_buffer, + first_col, + )?; + } + DataType::UInt8 => { + $create_dictionary_hash_method::( + col, + $hashes_buffer, + first_col, + )?; + } + DataType::UInt16 => { + $create_dictionary_hash_method::( + col, + $hashes_buffer, + first_col, + )?; + } + DataType::UInt32 => { + $create_dictionary_hash_method::( + col, + $hashes_buffer, + first_col, + )?; + } + DataType::UInt64 => { + $create_dictionary_hash_method::( + col, + $hashes_buffer, + first_col, + )?; + } + _ => { + return Err(DataFusionError::Internal(format!( + "Unsupported dictionary type in hasher hashing: {}", + col.data_type(), + ))) + } + }, + _ => { + // This is 
internal because we should have caught this before. + return Err(DataFusionError::Internal(format!( + "Unsupported data type in hasher: {}", + col.data_type() + ))); + } + } + } + }; +} + +/// Creates hash values for every row, based on the values in the +/// columns. +/// +/// The number of rows to hash is determined by `hashes_buffer.len()`. +/// `hashes_buffer` should be pre-sized appropriately +pub fn create_murmur3_hashes<'a>( + arrays: &[ArrayRef], + hashes_buffer: &'a mut [u32], +) -> Result<&'a mut [u32]> { + create_hashes_internal!( + arrays, + hashes_buffer, + spark_compatible_murmur3_hash, + create_hashes_dictionary + ); + Ok(hashes_buffer) +} + +/// Creates xxhash64 hash values for every row, based on the values in the +/// columns. +/// +/// The number of rows to hash is determined by `hashes_buffer.len()`. +/// `hashes_buffer` should be pre-sized appropriately +pub fn create_xxhash64_hashes<'a>( + arrays: &[ArrayRef], + hashes_buffer: &'a mut [u64], +) -> Result<&'a mut [u64]> { + create_hashes_internal!( + arrays, + hashes_buffer, + spark_compatible_xxhash64, + create_xxhash64_hashes_dictionary + ); + Ok(hashes_buffer) +} + +#[cfg(test)] +mod tests { + use arrow::array::{Float32Array, Float64Array}; + use std::sync::Arc; + + use super::{create_murmur3_hashes, create_xxhash64_hashes}; + use datafusion::arrow::array::{ArrayRef, Int32Array, Int64Array, Int8Array, StringArray}; + + macro_rules! test_hashes_internal { + ($hash_method: ident, $input: expr, $initial_seeds: expr, $expected: expr) => { + let i = $input; + let mut hashes = $initial_seeds.clone(); + $hash_method(&[i], &mut hashes).unwrap(); + assert_eq!(hashes, $expected); + }; + } + + macro_rules! test_hashes_with_nulls { + ($method: ident, $t: ty, $values: ident, $expected: ident, $seed_type: ty) => { + // copied before inserting nulls + let mut input_with_nulls = $values.clone(); + let mut expected_with_nulls = $expected.clone(); + // test before inserting nulls + let len = $values.len(); + let initial_seeds = vec![42 as $seed_type; len]; + let i = Arc::new(<$t>::from($values)) as ArrayRef; + test_hashes_internal!($method, i, initial_seeds, $expected); + + // test with nulls + let median = len / 2; + input_with_nulls.insert(0, None); + input_with_nulls.insert(median, None); + expected_with_nulls.insert(0, 42 as $seed_type); + expected_with_nulls.insert(median, 42 as $seed_type); + let len_with_nulls = len + 2; + let initial_seeds_with_nulls = vec![42 as $seed_type; len_with_nulls]; + let nullable_input = Arc::new(<$t>::from(input_with_nulls)) as ArrayRef; + test_hashes_internal!( + $method, + nullable_input, + initial_seeds_with_nulls, + expected_with_nulls + ); + }; + } + + fn test_murmur3_hash>> + 'static>( + values: Vec>, + expected: Vec, + ) { + test_hashes_with_nulls!(create_murmur3_hashes, T, values, expected, u32); + } + + fn test_xxhash64_hash>> + 'static>( + values: Vec>, + expected: Vec, + ) { + test_hashes_with_nulls!(create_xxhash64_hashes, T, values, expected, u64); + } + + #[test] + fn test_i8() { + test_murmur3_hash::( + vec![Some(1), Some(0), Some(-1), Some(i8::MAX), Some(i8::MIN)], + vec![0xdea578e3, 0x379fae8f, 0xa0590e3d, 0x43b4d8ed, 0x422a1365], + ); + test_xxhash64_hash::( + vec![Some(1), Some(0), Some(-1), Some(i8::MAX), Some(i8::MIN)], + vec![ + 0xa309b38455455929, + 0x3229fbc4681e48f3, + 0x1bfdda8861c06e45, + 0x77cc15d9f9f2cdc2, + 0x39bc22b9e94d81d0, + ], + ); + } + + #[test] + fn test_i32() { + test_murmur3_hash::( + vec![Some(1), Some(0), Some(-1), Some(i32::MAX), Some(i32::MIN)], + 
vec![0xdea578e3, 0x379fae8f, 0xa0590e3d, 0x07fb67e7, 0x2b1f0fc6], + ); + test_xxhash64_hash::( + vec![Some(1), Some(0), Some(-1), Some(i32::MAX), Some(i32::MIN)], + vec![ + 0xa309b38455455929, + 0x3229fbc4681e48f3, + 0x1bfdda8861c06e45, + 0x14f0ac009c21721c, + 0x1cc7cb8d034769cd, + ], + ); + } + + #[test] + fn test_i64() { + test_murmur3_hash::( + vec![Some(1), Some(0), Some(-1), Some(i64::MAX), Some(i64::MIN)], + vec![0x99f0149d, 0x9c67b85d, 0xc8008529, 0xa05b5d7b, 0xcd1e64fb], + ); + test_xxhash64_hash::( + vec![Some(1), Some(0), Some(-1), Some(i64::MAX), Some(i64::MIN)], + vec![ + 0x9ed50fd59358d232, + 0xb71b47ebda15746c, + 0x358ae035bfb46fd2, + 0xd2f1c616ae7eb306, + 0x88608019c494c1f4, + ], + ); + } + + #[test] + fn test_f32() { + test_murmur3_hash::( + vec![ + Some(1.0), + Some(0.0), + Some(-0.0), + Some(-1.0), + Some(99999999999.99999999999), + Some(-99999999999.99999999999), + ], + vec![ + 0xe434cc39, 0x379fae8f, 0x379fae8f, 0xdc0da8eb, 0xcbdc340f, 0xc0361c86, + ], + ); + test_xxhash64_hash::( + vec![ + Some(1.0), + Some(0.0), + Some(-0.0), + Some(-1.0), + Some(99999999999.99999999999), + Some(-99999999999.99999999999), + ], + vec![ + 0x9b92689757fcdbd, + 0x3229fbc4681e48f3, + 0x3229fbc4681e48f3, + 0xa2becc0e61bb3823, + 0x8f20ab82d4f3687f, + 0xdce4982d97f7ac4, + ], + ) + } + + #[test] + fn test_f64() { + test_murmur3_hash::( + vec![ + Some(1.0), + Some(0.0), + Some(-0.0), + Some(-1.0), + Some(99999999999.99999999999), + Some(-99999999999.99999999999), + ], + vec![ + 0xe4876492, 0x9c67b85d, 0x9c67b85d, 0x13d81357, 0xb87e1595, 0xa0eef9f9, + ], + ); + + test_xxhash64_hash::( + vec![ + Some(1.0), + Some(0.0), + Some(-0.0), + Some(-1.0), + Some(99999999999.99999999999), + Some(-99999999999.99999999999), + ], + vec![ + 0xe1fd6e07fee8ad53, + 0xb71b47ebda15746c, + 0xb71b47ebda15746c, + 0x8cdde022746f8f1f, + 0x793c5c88d313eac7, + 0xc5e60e7b75d9b232, + ], + ) + } + + #[test] + fn test_str() { + let input = [ + "hello", "bar", "", "😁", "天地", "a", "ab", "abc", "abcd", "abcde", + ] + .iter() + .map(|s| Some(s.to_string())) + .collect::>>(); + let expected: Vec = vec![ + 3286402344, 2486176763, 142593372, 885025535, 2395000894, 1485273170, 0xfa37157b, + 1322437556, 0xe860e5cc, 814637928, + ]; + + test_murmur3_hash::(input.clone(), expected); + test_xxhash64_hash::( + input, + vec![ + 0xc3629e6318d53932, + 0xe7097b6a54378d8a, + 0x98b1582b0977e704, + 0xa80d9d5a6a523bd5, + 0xfcba5f61ac666c61, + 0x88e4fe59adf7b0cc, + 0x259dd873209a3fe3, + 0x13c1d910702770e6, + 0xa17b5eb5dc364dff, + 0xf241303e4a90f299, + ], + ) + } +} diff --git a/src/xxhash64.rs b/src/xxhash64.rs new file mode 100644 index 000000000000..f5a11f66cd7d --- /dev/null +++ b/src/xxhash64.rs @@ -0,0 +1,190 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! 
xxhash64 implementation + +const CHUNK_SIZE: usize = 32; + +const PRIME_1: u64 = 11_400_714_785_074_694_791; +const PRIME_2: u64 = 14_029_467_366_897_019_727; +const PRIME_3: u64 = 1_609_587_929_392_839_161; +const PRIME_4: u64 = 9_650_029_242_287_828_579; +const PRIME_5: u64 = 2_870_177_450_012_600_261; + +/// Custom implementation of xxhash64 based on code from https://github.com/shepmaster/twox-hash +/// but optimized for our use case by removing any intermediate buffering, which is +/// not required because we are operating on data that is already in memory. +#[inline] +pub(crate) fn spark_compatible_xxhash64>(data: T, seed: u64) -> u64 { + let data: &[u8] = data.as_ref(); + let length_bytes = data.len(); + + let mut v1 = seed.wrapping_add(PRIME_1).wrapping_add(PRIME_2); + let mut v2 = seed.wrapping_add(PRIME_2); + let mut v3 = seed; + let mut v4 = seed.wrapping_sub(PRIME_1); + + // process chunks of 32 bytes + let mut offset_u64_4 = 0; + let ptr_u64 = data.as_ptr() as *const u64; + unsafe { + while offset_u64_4 * CHUNK_SIZE + CHUNK_SIZE <= length_bytes { + v1 = ingest_one_number(v1, ptr_u64.add(offset_u64_4 * 4).read_unaligned().to_le()); + v2 = ingest_one_number( + v2, + ptr_u64.add(offset_u64_4 * 4 + 1).read_unaligned().to_le(), + ); + v3 = ingest_one_number( + v3, + ptr_u64.add(offset_u64_4 * 4 + 2).read_unaligned().to_le(), + ); + v4 = ingest_one_number( + v4, + ptr_u64.add(offset_u64_4 * 4 + 3).read_unaligned().to_le(), + ); + offset_u64_4 += 1; + } + } + + let mut hash = if length_bytes >= CHUNK_SIZE { + // We have processed at least one full chunk + let mut hash = v1.rotate_left(1); + hash = hash.wrapping_add(v2.rotate_left(7)); + hash = hash.wrapping_add(v3.rotate_left(12)); + hash = hash.wrapping_add(v4.rotate_left(18)); + + hash = mix_one(hash, v1); + hash = mix_one(hash, v2); + hash = mix_one(hash, v3); + hash = mix_one(hash, v4); + + hash + } else { + seed.wrapping_add(PRIME_5) + }; + + hash = hash.wrapping_add(length_bytes as u64); + + // process u64s + let mut offset_u64 = offset_u64_4 * 4; + while offset_u64 * 8 + 8 <= length_bytes { + let mut k1 = unsafe { + ptr_u64 + .add(offset_u64) + .read_unaligned() + .to_le() + .wrapping_mul(PRIME_2) + }; + k1 = k1.rotate_left(31); + k1 = k1.wrapping_mul(PRIME_1); + hash ^= k1; + hash = hash.rotate_left(27); + hash = hash.wrapping_mul(PRIME_1); + hash = hash.wrapping_add(PRIME_4); + offset_u64 += 1; + } + + // process u32s + let data = &data[offset_u64 * 8..]; + let ptr_u32 = data.as_ptr() as *const u32; + let length_bytes = length_bytes - offset_u64 * 8; + let mut offset_u32 = 0; + while offset_u32 * 4 + 4 <= length_bytes { + let k1 = unsafe { + u64::from(ptr_u32.add(offset_u32).read_unaligned().to_le()).wrapping_mul(PRIME_1) + }; + hash ^= k1; + hash = hash.rotate_left(23); + hash = hash.wrapping_mul(PRIME_2); + hash = hash.wrapping_add(PRIME_3); + offset_u32 += 1; + } + + // process u8s + let data = &data[offset_u32 * 4..]; + let length_bytes = length_bytes - offset_u32 * 4; + let mut offset_u8 = 0; + while offset_u8 < length_bytes { + let k1 = u64::from(data[offset_u8]).wrapping_mul(PRIME_5); + hash ^= k1; + hash = hash.rotate_left(11); + hash = hash.wrapping_mul(PRIME_1); + offset_u8 += 1; + } + + // The final intermixing + hash ^= hash >> 33; + hash = hash.wrapping_mul(PRIME_2); + hash ^= hash >> 29; + hash = hash.wrapping_mul(PRIME_3); + hash ^= hash >> 32; + + hash +} + +#[inline(always)] +fn ingest_one_number(mut current_value: u64, mut value: u64) -> u64 { + value = value.wrapping_mul(PRIME_2); + current_value = 
current_value.wrapping_add(value); + current_value = current_value.rotate_left(31); + current_value.wrapping_mul(PRIME_1) +} + +#[inline(always)] +fn mix_one(mut hash: u64, mut value: u64) -> u64 { + value = value.wrapping_mul(PRIME_2); + value = value.rotate_left(31); + value = value.wrapping_mul(PRIME_1); + hash ^= value; + hash = hash.wrapping_mul(PRIME_1); + hash.wrapping_add(PRIME_4) +} + +#[cfg(test)] +mod test { + use super::spark_compatible_xxhash64; + use rand::Rng; + use std::hash::Hasher; + use twox_hash::XxHash64; + + #[test] + #[cfg_attr(miri, ignore)] // test takes too long with miri + fn test_xxhash64_random() { + let mut rng = rand::thread_rng(); + for len in 0..128 { + for _ in 0..10 { + let data: Vec = (0..len).map(|_| rng.gen()).collect(); + let seed = rng.gen(); + check_xxhash64(&data, seed); + } + } + } + + fn check_xxhash64(data: &[u8], seed: u64) { + let mut hasher = XxHash64::with_seed(seed); + hasher.write(data.as_ref()); + let hash1 = hasher.finish(); + let hash2 = spark_compatible_xxhash64(data, seed); + if hash1 != hash2 { + panic!("input: {} with seed {seed} produced incorrect hash (comet={hash2}, twox-hash={hash1})", + data.iter().fold(String::new(), |mut output, byte| { + output.push_str(&format!("{:02x}", byte)); + output + })) + } + } +} From 0a003250aa425fe936a5930df74d498f1f2a01bd Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 31 Jul 2024 10:57:46 -0600 Subject: [PATCH 14/68] chore: Add criterion benchmark for decimal_div (#743) --- Cargo.toml | 4 ++++ benches/decimal_div.rs | 54 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+) create mode 100644 benches/decimal_div.rs diff --git a/Cargo.toml b/Cargo.toml index a535a2b817e6..96eae39ffbd2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -64,3 +64,7 @@ harness = false [[bench]] name = "conditional" harness = false + +[[bench]] +name = "decimal_div" +harness = false diff --git a/benches/decimal_div.rs b/benches/decimal_div.rs new file mode 100644 index 000000000000..89f06e50532e --- /dev/null +++ b/benches/decimal_div.rs @@ -0,0 +1,54 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
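With the `[[bench]]` entry registered above (`harness = false`), this Criterion benchmark is invoked with `cargo bench --bench decimal_div`. The body below builds two 1,000-row decimal columns, casts them to `Decimal128(10, 4)` and `Decimal128(10, 3)`, and measures `spark_decimal_div` over the pair.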
+ +use arrow::compute::cast; +use arrow_array::builder::Decimal128Builder; +use arrow_schema::DataType; +use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use datafusion_comet_spark_expr::scalar_funcs::spark_decimal_div; +use datafusion_expr::ColumnarValue; +use std::sync::Arc; + +fn criterion_benchmark(c: &mut Criterion) { + // create input data + let mut c1 = Decimal128Builder::new(); + let mut c2 = Decimal128Builder::new(); + for i in 0..1000 { + c1.append_value(99999999 + i); + c2.append_value(88888888 - i); + } + let c1 = Arc::new(c1.finish()); + let c2 = Arc::new(c2.finish()); + + let c1_type = DataType::Decimal128(10, 4); + let c1 = cast(c1.as_ref(), &c1_type).unwrap(); + let c2_type = DataType::Decimal128(10, 3); + let c2 = cast(c2.as_ref(), &c2_type).unwrap(); + + let args = [ColumnarValue::Array(c1), ColumnarValue::Array(c2)]; + c.bench_function("decimal_div", |b| { + b.iter(|| { + black_box(spark_decimal_div( + black_box(&args), + black_box(&DataType::Decimal128(10, 4)), + )) + }) + }); +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); From abfce136d94e4a07cca77ac79c28175376be36e7 Mon Sep 17 00:00:00 2001 From: Akhil S S <88586412+akhilss99@users.noreply.github.com> Date: Thu, 1 Aug 2024 20:30:58 +0530 Subject: [PATCH 15/68] Add support for time-zone, 3 & 5 digit years: Cast from string to timestamp (#704) --- src/cast.rs | 343 ++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 251 insertions(+), 92 deletions(-) diff --git a/src/cast.rs b/src/cast.rs index ae0818970f03..e44b1c9f5db4 100644 --- a/src/cast.rs +++ b/src/cast.rs @@ -15,14 +15,6 @@ // specific language governing permissions and limitations // under the License. -use std::{ - any::Any, - fmt::{Debug, Display, Formatter}, - hash::{Hash, Hasher}, - num::Wrapping, - sync::Arc, -}; - use arrow::{ array::{ cast::AsArray, @@ -42,6 +34,14 @@ use arrow::{ }; use arrow_array::DictionaryArray; use arrow_schema::{DataType, Schema}; +use std::str::FromStr; +use std::{ + any::Any, + fmt::{Debug, Display, Formatter}, + hash::{Hash, Hasher}, + num::Wrapping, + sync::Arc, +}; use datafusion_common::{ cast::as_generic_string_array, internal_err, Result as DataFusionResult, ScalarValue, @@ -56,6 +56,7 @@ use num::{ }; use regex::Regex; +use crate::timezone; use crate::utils::{array_with_timezone, down_cast_any_ref}; use crate::{EvalMode, SparkError, SparkResult}; @@ -71,6 +72,67 @@ static CAST_OPTIONS: CastOptions = CastOptions { .with_timestamp_format(TIMESTAMP_FORMAT), }; +struct TimeStampInfo { + year: i32, + month: u32, + day: u32, + hour: u32, + minute: u32, + second: u32, + microsecond: u32, +} + +impl Default for TimeStampInfo { + fn default() -> Self { + TimeStampInfo { + year: 1, + month: 1, + day: 1, + hour: 0, + minute: 0, + second: 0, + microsecond: 0, + } + } +} + +impl TimeStampInfo { + pub fn with_year(&mut self, year: i32) -> &mut Self { + self.year = year; + self + } + + pub fn with_month(&mut self, month: u32) -> &mut Self { + self.month = month; + self + } + + pub fn with_day(&mut self, day: u32) -> &mut Self { + self.day = day; + self + } + + pub fn with_hour(&mut self, hour: u32) -> &mut Self { + self.hour = hour; + self + } + + pub fn with_minute(&mut self, minute: u32) -> &mut Self { + self.minute = minute; + self + } + + pub fn with_second(&mut self, second: u32) -> &mut Self { + self.second = second; + self + } + + pub fn with_microsecond(&mut self, microsecond: u32) -> &mut Self { + self.microsecond = microsecond; + self + } +} + 
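The setters above each return `&mut Self`, so the parsing helpers later in this patch can chain them and fall back to the defaults (`0001-01-01 00:00:00.000000`) for any field the input string does not supply. A small illustrative sketch (the helper function is assumed, not part of the patch):

```rust
// Hypothetical: how a partially specified input such as "2024-07" maps onto
// TimeStampInfo: only year and month are set, the remaining fields keep defaults.
fn example_info() -> TimeStampInfo {
    let mut info = TimeStampInfo::default();
    info.with_year(2024).with_month(7);
    info
}
```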
#[derive(Debug, Hash)] pub struct Cast { pub child: Arc, @@ -100,13 +162,15 @@ macro_rules! cast_utf8_to_int { }}; } macro_rules! cast_utf8_to_timestamp { - ($array:expr, $eval_mode:expr, $array_type:ty, $cast_method:ident) => {{ + ($array:expr, $eval_mode:expr, $array_type:ty, $cast_method:ident, $tz:expr) => {{ let len = $array.len(); let mut cast_array = PrimitiveArray::<$array_type>::builder(len).with_timezone("UTC"); for i in 0..len { if $array.is_null(i) { cast_array.append_null() - } else if let Ok(Some(cast_value)) = $cast_method($array.value(i).trim(), $eval_mode) { + } else if let Ok(Some(cast_value)) = + $cast_method($array.value(i).trim(), $eval_mode, $tz) + { cast_array.append_value(cast_value); } else { cast_array.append_null() @@ -574,7 +638,7 @@ fn cast_array( spark_cast_utf8_to_boolean::(&array, eval_mode) } (DataType::Utf8, DataType::Timestamp(_, _)) => { - cast_string_to_timestamp(&array, to_type, eval_mode) + cast_string_to_timestamp(&array, to_type, eval_mode, &timezone) } (DataType::Utf8, DataType::Date32) => cast_string_to_date(&array, to_type, eval_mode), (DataType::Int64, DataType::Int32) @@ -782,19 +846,23 @@ fn cast_string_to_timestamp( array: &ArrayRef, to_type: &DataType, eval_mode: EvalMode, + timezone_str: &str, ) -> SparkResult { let string_array = array .as_any() .downcast_ref::>() .expect("Expected a string array"); + let tz = &timezone::Tz::from_str(timezone_str).unwrap(); + let cast_array: ArrayRef = match to_type { DataType::Timestamp(_, _) => { cast_utf8_to_timestamp!( string_array, eval_mode, TimestampMicrosecondType, - timestamp_parser + timestamp_parser, + tz ) } _ => unreachable!("Invalid data type {:?} in cast from string", to_type), @@ -1344,7 +1412,11 @@ impl PhysicalExpr for Cast { } } -fn timestamp_parser(value: &str, eval_mode: EvalMode) -> SparkResult> { +fn timestamp_parser( + value: &str, + eval_mode: EvalMode, + tz: &T, +) -> SparkResult> { let value = value.trim(); if value.is_empty() { return Ok(None); @@ -1352,31 +1424,31 @@ fn timestamp_parser(value: &str, eval_mode: EvalMode) -> SparkResult // Define regex patterns and corresponding parsing functions let patterns = &[ ( - Regex::new(r"^\d{4}$").unwrap(), - parse_str_to_year_timestamp as fn(&str) -> SparkResult>, + Regex::new(r"^\d{4,5}$").unwrap(), + parse_str_to_year_timestamp as fn(&str, &T) -> SparkResult>, ), ( - Regex::new(r"^\d{4}-\d{2}$").unwrap(), + Regex::new(r"^\d{4,5}-\d{2}$").unwrap(), parse_str_to_month_timestamp, ), ( - Regex::new(r"^\d{4}-\d{2}-\d{2}$").unwrap(), + Regex::new(r"^\d{4,5}-\d{2}-\d{2}$").unwrap(), parse_str_to_day_timestamp, ), ( - Regex::new(r"^\d{4}-\d{2}-\d{2}T\d{1,2}$").unwrap(), + Regex::new(r"^\d{4,5}-\d{2}-\d{2}T\d{1,2}$").unwrap(), parse_str_to_hour_timestamp, ), ( - Regex::new(r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}$").unwrap(), + Regex::new(r"^\d{4,5}-\d{2}-\d{2}T\d{2}:\d{2}$").unwrap(), parse_str_to_minute_timestamp, ), ( - Regex::new(r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}$").unwrap(), + Regex::new(r"^\d{4,5}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}$").unwrap(), parse_str_to_second_timestamp, ), ( - Regex::new(r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{1,6}$").unwrap(), + Regex::new(r"^\d{4,5}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{1,6}$").unwrap(), parse_str_to_microsecond_timestamp, ), ( @@ -1390,7 +1462,7 @@ fn timestamp_parser(value: &str, eval_mode: EvalMode) -> SparkResult // Iterate through patterns and try matching for (pattern, parse_func) in patterns { if pattern.is_match(value) { - timestamp = parse_func(value)?; + timestamp = parse_func(value, tz)?; 
break; } } @@ -1415,38 +1487,24 @@ fn timestamp_parser(value: &str, eval_mode: EvalMode) -> SparkResult } } -fn parse_ymd_timestamp(year: i32, month: u32, day: u32) -> SparkResult> { - let datetime = chrono::Utc.with_ymd_and_hms(year, month, day, 0, 0, 0); - - // Check if datetime is not None - let utc_datetime = match datetime.single() { - Some(dt) => dt.with_timezone(&chrono::Utc), - None => { - return Err(SparkError::Internal( - "Failed to parse timestamp".to_string(), - )); - } - }; - - Ok(Some(utc_datetime.timestamp_micros())) -} - -fn parse_hms_timestamp( - year: i32, - month: u32, - day: u32, - hour: u32, - minute: u32, - second: u32, - microsecond: u32, +fn parse_timestamp_to_micros( + timestamp_info: &TimeStampInfo, + tz: &T, ) -> SparkResult> { - let datetime = chrono::Utc.with_ymd_and_hms(year, month, day, hour, minute, second); + let datetime = tz.with_ymd_and_hms( + timestamp_info.year, + timestamp_info.month, + timestamp_info.day, + timestamp_info.hour, + timestamp_info.minute, + timestamp_info.second, + ); // Check if datetime is not None - let utc_datetime = match datetime.single() { + let tz_datetime = match datetime.single() { Some(dt) => dt - .with_timezone(&chrono::Utc) - .with_nanosecond(microsecond * 1000), + .with_timezone(tz) + .with_nanosecond(timestamp_info.microsecond * 1000), None => { return Err(SparkError::Internal( "Failed to parse timestamp".to_string(), @@ -1454,7 +1512,7 @@ fn parse_hms_timestamp( } }; - let result = match utc_datetime { + let result = match tz_datetime { Some(dt) => dt.timestamp_micros(), None => { return Err(SparkError::Internal( @@ -1466,7 +1524,11 @@ fn parse_hms_timestamp( Ok(Some(result)) } -fn get_timestamp_values(value: &str, timestamp_type: &str) -> SparkResult> { +fn get_timestamp_values( + value: &str, + timestamp_type: &str, + tz: &T, +) -> SparkResult> { let values: Vec<_> = value .split(|c| c == 'T' || c == '-' || c == ':' || c == '.') .collect(); @@ -1478,64 +1540,99 @@ fn get_timestamp_values(value: &str, timestamp_type: &str) -> SparkResult - logo - - -DataFusion is an extensible query engine written in [Rust] that -uses [Apache Arrow] as its in-memory format. - -This crate provides libraries and binaries for developers building fast and -feature rich database and analytic systems, customized to particular workloads. -See [use cases] for examples. The following related subprojects target end users: - -- [DataFusion Python](https://github.com/apache/datafusion-python/) offers a Python interface for SQL and DataFrame - queries. -- [DataFusion Ray](https://github.com/apache/datafusion-ray/) provides a distributed version of DataFusion that scales - out on Ray clusters. -- [DataFusion Comet](https://github.com/apache/datafusion-comet/) is an accelerator for Apache Spark based on - DataFusion. - -"Out of the box," -DataFusion offers [SQL] and [`Dataframe`] APIs, excellent [performance], -built-in support for CSV, Parquet, JSON, and Avro, extensive customization, and -a great community. - -DataFusion features a full query planner, a columnar, streaming, multi-threaded, -vectorized execution engine, and partitioned data sources. You can -customize DataFusion at almost all points including additional data sources, -query languages, functions, custom operators and more. -See the [Architecture] section for more details. 
- -[rust]: http://rustlang.org -[apache arrow]: https://arrow.apache.org -[use cases]: https://datafusion.apache.org/user-guide/introduction.html#use-cases -[python bindings]: https://github.com/apache/datafusion-python -[performance]: https://benchmark.clickhouse.com/ -[architecture]: https://datafusion.apache.org/contributor-guide/architecture.html - -Here are links to some important information - -- [Project Site](https://datafusion.apache.org/) -- [Installation](https://datafusion.apache.org/user-guide/cli/installation.html) -- [Rust Getting Started](https://datafusion.apache.org/user-guide/example-usage.html) -- [Rust DataFrame API](https://datafusion.apache.org/user-guide/dataframe.html) -- [Rust API docs](https://docs.rs/datafusion/latest/datafusion) -- [Rust Examples](https://github.com/apache/datafusion/tree/main/datafusion-examples) -- [Python DataFrame API](https://arrow.apache.org/datafusion-python/) -- [Architecture](https://docs.rs/datafusion/latest/datafusion/index.html#architecture) - -## What can you do with this crate? - -DataFusion is great for building projects such as domain specific query engines, new database platforms and data pipelines, query languages and more. -It lets you start quickly from a fully working engine, and then customize those features specific to your use. [Click Here](https://datafusion.apache.org/user-guide/introduction.html#known-users) to see a list known users. - -## Contributing to DataFusion - -Please see the [contributor guide] and [communication] pages for more information. - -[contributor guide]: https://datafusion.apache.org/contributor-guide -[communication]: https://datafusion.apache.org/contributor-guide/communication.html - -## Crate features - -This crate has several [features] which can be specified in your `Cargo.toml`. - -[features]: https://doc.rust-lang.org/cargo/reference/features.html - -Default features: - -- `nested_expressions`: functions for working with nested type function such as `array_to_string` -- `compression`: reading files compressed with `xz2`, `bzip2`, `flate2`, and `zstd` -- `crypto_expressions`: cryptographic functions such as `md5` and `sha256` -- `datetime_expressions`: date and time functions such as `to_timestamp` -- `encoding_expressions`: `encode` and `decode` functions -- `parquet`: support for reading the [Apache Parquet] format -- `regex_expressions`: regular expression functions, such as `regexp_match` -- `unicode_expressions`: Include unicode aware functions such as `character_length` -- `unparser`: enables support to reverse LogicalPlans back into SQL -- `recursive_protection`: uses [recursive](https://docs.rs/recursive/latest/recursive/) for stack overflow protection. - -Optional features: - -- `avro`: support for reading the [Apache Avro] format -- `backtrace`: include backtrace information in error messages -- `pyarrow`: conversions between PyArrow and DataFusion types -- `serde`: enable arrow-schema's `serde` feature - -[apache avro]: https://avro.apache.org/ -[apache parquet]: https://parquet.apache.org/ - -## Rust Version Compatibility Policy - -The Rust toolchain releases are tracked at [Rust Versions](https://releases.rs) and follow -[semantic versioning](https://semver.org/). A Rust toolchain release can be identified -by a version string like `1.80.0`, or more generally `major.minor.patch`. - -DataFusion's supports the last 4 stable Rust minor versions released and any such versions released within the last 4 months. 
- -For example, given the releases `1.78.0`, `1.79.0`, `1.80.0`, `1.80.1` and `1.81.0` DataFusion will support 1.78.0, which is 3 minor versions prior to the most minor recent `1.81`. - -Note: If a Rust hotfix is released for the current MSRV, the MSRV will be updated to the specific minor version that includes all applicable hotfixes preceding other policies. - -DataFusion enforces MSRV policy using a [MSRV CI Check](https://github.com/search?q=repo%3Aapache%2Fdatafusion+rust-version+language%3ATOML+path%3A%2F%5ECargo.toml%2F&type=code) - -## DataFusion API Evolution and Deprecation Guidelines - -Public methods in Apache DataFusion evolve over time: while we try to maintain a -stable API, we also improve the API over time. As a result, we typically -deprecate methods before removing them, according to the [deprecation guidelines]. - -[deprecation guidelines]: https://datafusion.apache.org/library-user-guide/api-health.html - -## Dependencies and a `Cargo.lock` - -`datafusion` is intended for use as a library and thus purposely does not have a -`Cargo.lock` file checked in. You can read more about the distinction in the -[Cargo book]. - -CI tests always run against the latest compatible versions of all dependencies -(the equivalent of doing `cargo update`), as suggested in the [Cargo CI guide] -and we rely on Dependabot for other upgrades. This strategy has two problems -that occasionally arise: - -1. CI failures when downstream libraries upgrade in some non compatible way -2. Local development builds that fail when DataFusion inadvertently relies on - a feature in a newer version of a dependency than declared in `Cargo.toml` - (e.g. a new method is added to a trait that we use). - -However, we think the current strategy is the best tradeoff between maintenance -overhead and user experience and ensures DataFusion always works with the latest -compatible versions of all dependencies. If you encounter either of these -problems, please open an issue or PR. - -[cargo book]: https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html - -# [cargo ci guide]: https://doc.rust-lang.org/cargo/guide/continuous-integration.html#verifying-latest-dependencies - -# datafusion-comet-spark-expr: Spark-compatible Expressions - -This crate provides Apache Spark-compatible expressions for use with DataFusion and is maintained as part of the -[Apache DataFusion Comet](https://github.com/apache/datafusion-comet/) subproject. +# datafusion-functions-spark: Spark-compatible Expressions -> > > > > > > comet/main +This crate provides Apache Spark-compatible expressions for use with DataFusion. 
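As a quick orientation for the crate this README introduces, the scalar kernels shown earlier in the series can be called directly on Arrow data. A hedged sketch; the module path is assumed from the `decimal_div` bench's `scalar_funcs` import and the `datafusion_comet_spark_expr` lib name, and may differ after the move:

```rust
use std::sync::Arc;
use arrow_array::Int64Array;
use datafusion_expr::ColumnarValue;
// Path assumed from this patch series; re-exports may differ in the final layout.
use datafusion_comet_spark_expr::scalar_funcs::hex::spark_hex;

fn main() -> datafusion_common::Result<()> {
    let input = ColumnarValue::Array(Arc::new(Int64Array::from(vec![Some(17), None])));
    let hexed = spark_hex(&[input])?; // 17 -> "11", nulls stay null, matching Spark's `hex`
    println!("{:?}", hexed.data_type());
    Ok(())
}
```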
From d7eaf68978629c764f963ed4d114e5b757623a4f Mon Sep 17 00:00:00 2001
From: Andy Grove
Date: Fri, 31 Jan 2025 14:26:19 -0700
Subject: [PATCH 66/68] fix cargo doc failures

---
 datafusion/functions-spark/src/datetime_funcs/date_trunc.rs | 2 +-
 .../functions-spark/src/datetime_funcs/timestamp_trunc.rs | 4 ++--
 datafusion/functions-spark/src/predicate_funcs/rlike.rs | 2 +-
 datafusion/functions-spark/src/utils.rs | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/datafusion/functions-spark/src/datetime_funcs/date_trunc.rs b/datafusion/functions-spark/src/datetime_funcs/date_trunc.rs
index 5c044945d04c..a3b06e6a1c0f 100644
--- a/datafusion/functions-spark/src/datetime_funcs/date_trunc.rs
+++ b/datafusion/functions-spark/src/datetime_funcs/date_trunc.rs
@@ -33,7 +33,7 @@ use crate::kernels::temporal::{date_trunc_array_fmt_dyn, date_trunc_dyn};
 pub struct DateTruncExpr {
     /// An array with DataType::Date32
     child: Arc<dyn PhysicalExpr>,
-    /// Scalar UTF8 string matching the valid values in Spark SQL: https://spark.apache.org/docs/latest/api/sql/index.html#trunc
+    /// Scalar UTF8 string matching the valid values in Spark SQL: <https://spark.apache.org/docs/latest/api/sql/index.html#trunc>
     format: Arc<dyn PhysicalExpr>,
 }

diff --git a/datafusion/functions-spark/src/datetime_funcs/timestamp_trunc.rs b/datafusion/functions-spark/src/datetime_funcs/timestamp_trunc.rs
index 349992322f9b..bca9b8e8daab 100644
--- a/datafusion/functions-spark/src/datetime_funcs/timestamp_trunc.rs
+++ b/datafusion/functions-spark/src/datetime_funcs/timestamp_trunc.rs
@@ -34,10 +34,10 @@ use crate::kernels::temporal::{timestamp_trunc_array_fmt_dyn, timestamp_trunc_dyn};
 pub struct TimestampTruncExpr {
     /// An array with DataType::Timestamp(TimeUnit::Microsecond, None)
     child: Arc<dyn PhysicalExpr>,
-    /// Scalar UTF8 string matching the valid values in Spark SQL: https://spark.apache.org/docs/latest/api/sql/index.html#date_trunc
+    /// Scalar UTF8 string matching the valid values in Spark SQL: <https://spark.apache.org/docs/latest/api/sql/index.html#date_trunc>
     format: Arc<dyn PhysicalExpr>,
     /// String containing a timezone name. The name must be found in the standard timezone
-    /// database (https://en.wikipedia.org/wiki/List_of_tz_database_time_zones). The string is
+    /// database (<https://en.wikipedia.org/wiki/List_of_tz_database_time_zones>). The string is
     /// later parsed into a chrono::TimeZone.
     /// Timestamp arrays in this implementation are kept in arrays of UTC timestamps (in micros)
     /// along with a single value for the associated TimeZone.
The timezone offset is applied diff --git a/datafusion/functions-spark/src/predicate_funcs/rlike.rs b/datafusion/functions-spark/src/predicate_funcs/rlike.rs index 7b67b0099c37..bfee0cc769cb 100644 --- a/datafusion/functions-spark/src/predicate_funcs/rlike.rs +++ b/datafusion/functions-spark/src/predicate_funcs/rlike.rs @@ -38,7 +38,7 @@ use std::sync::Arc; /// differences in whitespace handling and does not support all the features of Java's /// regular expression engine, which are documented at: /// -/// https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html +/// #[derive(Debug)] pub struct RLike { child: Arc, diff --git a/datafusion/functions-spark/src/utils.rs b/datafusion/functions-spark/src/utils.rs index d6090014d05a..37d633e52549 100644 --- a/datafusion/functions-spark/src/utils.rs +++ b/datafusion/functions-spark/src/utils.rs @@ -227,7 +227,7 @@ fn pre_timestamp_cast(array: ArrayRef, timezone: String) -> Result #[inline] pub fn is_valid_decimal_precision(value: i128, precision: u8) -> bool { precision <= DECIMAL128_MAX_PRECISION From 96f21363a3b4114b2e1a99d7f0e5d2c6dad94d53 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 31 Jan 2025 14:30:08 -0700 Subject: [PATCH 67/68] taplo fmt --- datafusion/functions-spark/Cargo.toml | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/datafusion/functions-spark/Cargo.toml b/datafusion/functions-spark/Cargo.toml index 08361c06b0dc..eaa48845820f 100644 --- a/datafusion/functions-spark/Cargo.toml +++ b/datafusion/functions-spark/Cargo.toml @@ -33,27 +33,26 @@ arrow-buffer = { workspace = true } arrow-data = { workspace = true } arrow-schema = { workspace = true } chrono = { workspace = true } +chrono-tz = "0.10.1" datafusion = { workspace = true, features = ["parquet"] } datafusion-common = { workspace = true } datafusion-expr = { workspace = true } datafusion-expr-common = { workspace = true } datafusion-physical-expr = { workspace = true } -chrono-tz = "0.10.1" +futures = { workspace = true } num = "0.4.3" +rand = { workspace = true } regex = { workspace = true } thiserror = "2.0.11" -futures = { workspace = true } twox-hash = "2.0.0" -rand = { workspace = true } [dev-dependencies] -arrow-data = {workspace = true} -parquet = { workspace = true, features = ["arrow"] } +arrow-data = { workspace = true } criterion = "0.5.1" -rand = { workspace = true} +parquet = { workspace = true, features = ["arrow"] } +rand = { workspace = true } tokio = { version = "1", features = ["rt-multi-thread"] } - [lib] name = "datafusion_comet_spark_expr" path = "src/lib.rs" @@ -77,4 +76,3 @@ harness = false [[bench]] name = "aggregate" harness = false - From 77e7831897a4b38581c40730128ccb350e72af2a Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 31 Jan 2025 16:04:07 -0700 Subject: [PATCH 68/68] rename functions-spark to spark --- Cargo.toml | 2 +- datafusion/{functions-spark => spark}/Cargo.toml | 2 +- datafusion/{functions-spark => spark}/README.md | 2 +- datafusion/{functions-spark => spark}/benches/aggregate.rs | 0 .../{functions-spark => spark}/benches/cast_from_string.rs | 0 datafusion/{functions-spark => spark}/benches/cast_numeric.rs | 0 datafusion/{functions-spark => spark}/benches/conditional.rs | 0 datafusion/{functions-spark => spark}/benches/decimal_div.rs | 0 datafusion/{functions-spark => spark}/src/agg_funcs/avg.rs | 0 .../{functions-spark => spark}/src/agg_funcs/avg_decimal.rs | 0 .../{functions-spark => spark}/src/agg_funcs/correlation.rs | 0 .../{functions-spark => 
spark}/src/agg_funcs/covariance.rs | 0 datafusion/{functions-spark => spark}/src/agg_funcs/mod.rs | 0 datafusion/{functions-spark => spark}/src/agg_funcs/stddev.rs | 0 .../{functions-spark => spark}/src/agg_funcs/sum_decimal.rs | 0 datafusion/{functions-spark => spark}/src/agg_funcs/variance.rs | 0 .../{functions-spark => spark}/src/array_funcs/array_insert.rs | 0 .../src/array_funcs/get_array_struct_fields.rs | 0 .../{functions-spark => spark}/src/array_funcs/list_extract.rs | 0 datafusion/{functions-spark => spark}/src/array_funcs/mod.rs | 0 .../{functions-spark => spark}/src/bitwise_funcs/bitwise_not.rs | 0 datafusion/{functions-spark => spark}/src/bitwise_funcs/mod.rs | 0 datafusion/{functions-spark => spark}/src/comet_scalar_funcs.rs | 0 .../{functions-spark => spark}/src/conditional_funcs/if_expr.rs | 0 .../{functions-spark => spark}/src/conditional_funcs/mod.rs | 0 .../{functions-spark => spark}/src/conversion_funcs/cast.rs | 0 .../{functions-spark => spark}/src/conversion_funcs/mod.rs | 0 .../src/datetime_funcs/date_arithmetic.rs | 0 .../{functions-spark => spark}/src/datetime_funcs/date_trunc.rs | 0 .../{functions-spark => spark}/src/datetime_funcs/hour.rs | 0 .../{functions-spark => spark}/src/datetime_funcs/minute.rs | 0 datafusion/{functions-spark => spark}/src/datetime_funcs/mod.rs | 0 .../{functions-spark => spark}/src/datetime_funcs/second.rs | 0 .../src/datetime_funcs/timestamp_trunc.rs | 0 datafusion/{functions-spark => spark}/src/error.rs | 0 datafusion/{functions-spark => spark}/src/hash_funcs/mod.rs | 0 datafusion/{functions-spark => spark}/src/hash_funcs/murmur3.rs | 0 datafusion/{functions-spark => spark}/src/hash_funcs/sha2.rs | 0 datafusion/{functions-spark => spark}/src/hash_funcs/utils.rs | 0 .../{functions-spark => spark}/src/hash_funcs/xxhash64.rs | 0 datafusion/{functions-spark => spark}/src/json_funcs/mod.rs | 0 datafusion/{functions-spark => spark}/src/json_funcs/to_json.rs | 0 datafusion/{functions-spark => spark}/src/kernels/mod.rs | 0 datafusion/{functions-spark => spark}/src/kernels/strings.rs | 0 datafusion/{functions-spark => spark}/src/kernels/temporal.rs | 0 datafusion/{functions-spark => spark}/src/lib.rs | 0 datafusion/{functions-spark => spark}/src/math_funcs/ceil.rs | 0 datafusion/{functions-spark => spark}/src/math_funcs/div.rs | 0 datafusion/{functions-spark => spark}/src/math_funcs/floor.rs | 0 datafusion/{functions-spark => spark}/src/math_funcs/hex.rs | 0 .../src/math_funcs/internal/checkoverflow.rs | 0 .../src/math_funcs/internal/make_decimal.rs | 0 .../{functions-spark => spark}/src/math_funcs/internal/mod.rs | 0 .../src/math_funcs/internal/normalize_nan.rs | 0 .../src/math_funcs/internal/unscaled_value.rs | 0 datafusion/{functions-spark => spark}/src/math_funcs/mod.rs | 0 .../{functions-spark => spark}/src/math_funcs/negative.rs | 0 datafusion/{functions-spark => spark}/src/math_funcs/round.rs | 0 datafusion/{functions-spark => spark}/src/math_funcs/unhex.rs | 0 datafusion/{functions-spark => spark}/src/math_funcs/utils.rs | 0 .../{functions-spark => spark}/src/predicate_funcs/is_nan.rs | 0 .../{functions-spark => spark}/src/predicate_funcs/mod.rs | 0 .../{functions-spark => spark}/src/predicate_funcs/rlike.rs | 0 .../src/static_invoke/char_varchar_utils/mod.rs | 0 .../src/static_invoke/char_varchar_utils/read_side_padding.rs | 0 datafusion/{functions-spark => spark}/src/static_invoke/mod.rs | 0 datafusion/{functions-spark => spark}/src/string_funcs/chr.rs | 0 datafusion/{functions-spark => spark}/src/string_funcs/mod.rs | 0 
.../{functions-spark => spark}/src/string_funcs/prediction.rs | 0 .../{functions-spark => spark}/src/string_funcs/string_space.rs | 0 .../{functions-spark => spark}/src/string_funcs/substring.rs | 0 .../src/struct_funcs/create_named_struct.rs | 0 .../src/struct_funcs/get_struct_field.rs | 0 datafusion/{functions-spark => spark}/src/struct_funcs/mod.rs | 0 .../{functions-spark => spark}/src/test_common/file_util.rs | 0 datafusion/{functions-spark => spark}/src/test_common/mod.rs | 0 datafusion/{functions-spark => spark}/src/timezone.rs | 0 datafusion/{functions-spark => spark}/src/unbound.rs | 0 datafusion/{functions-spark => spark}/src/utils.rs | 0 79 files changed, 3 insertions(+), 3 deletions(-) rename datafusion/{functions-spark => spark}/Cargo.toml (98%) rename datafusion/{functions-spark => spark}/README.md (93%) rename datafusion/{functions-spark => spark}/benches/aggregate.rs (100%) rename datafusion/{functions-spark => spark}/benches/cast_from_string.rs (100%) rename datafusion/{functions-spark => spark}/benches/cast_numeric.rs (100%) rename datafusion/{functions-spark => spark}/benches/conditional.rs (100%) rename datafusion/{functions-spark => spark}/benches/decimal_div.rs (100%) rename datafusion/{functions-spark => spark}/src/agg_funcs/avg.rs (100%) rename datafusion/{functions-spark => spark}/src/agg_funcs/avg_decimal.rs (100%) rename datafusion/{functions-spark => spark}/src/agg_funcs/correlation.rs (100%) rename datafusion/{functions-spark => spark}/src/agg_funcs/covariance.rs (100%) rename datafusion/{functions-spark => spark}/src/agg_funcs/mod.rs (100%) rename datafusion/{functions-spark => spark}/src/agg_funcs/stddev.rs (100%) rename datafusion/{functions-spark => spark}/src/agg_funcs/sum_decimal.rs (100%) rename datafusion/{functions-spark => spark}/src/agg_funcs/variance.rs (100%) rename datafusion/{functions-spark => spark}/src/array_funcs/array_insert.rs (100%) rename datafusion/{functions-spark => spark}/src/array_funcs/get_array_struct_fields.rs (100%) rename datafusion/{functions-spark => spark}/src/array_funcs/list_extract.rs (100%) rename datafusion/{functions-spark => spark}/src/array_funcs/mod.rs (100%) rename datafusion/{functions-spark => spark}/src/bitwise_funcs/bitwise_not.rs (100%) rename datafusion/{functions-spark => spark}/src/bitwise_funcs/mod.rs (100%) rename datafusion/{functions-spark => spark}/src/comet_scalar_funcs.rs (100%) rename datafusion/{functions-spark => spark}/src/conditional_funcs/if_expr.rs (100%) rename datafusion/{functions-spark => spark}/src/conditional_funcs/mod.rs (100%) rename datafusion/{functions-spark => spark}/src/conversion_funcs/cast.rs (100%) rename datafusion/{functions-spark => spark}/src/conversion_funcs/mod.rs (100%) rename datafusion/{functions-spark => spark}/src/datetime_funcs/date_arithmetic.rs (100%) rename datafusion/{functions-spark => spark}/src/datetime_funcs/date_trunc.rs (100%) rename datafusion/{functions-spark => spark}/src/datetime_funcs/hour.rs (100%) rename datafusion/{functions-spark => spark}/src/datetime_funcs/minute.rs (100%) rename datafusion/{functions-spark => spark}/src/datetime_funcs/mod.rs (100%) rename datafusion/{functions-spark => spark}/src/datetime_funcs/second.rs (100%) rename datafusion/{functions-spark => spark}/src/datetime_funcs/timestamp_trunc.rs (100%) rename datafusion/{functions-spark => spark}/src/error.rs (100%) rename datafusion/{functions-spark => spark}/src/hash_funcs/mod.rs (100%) rename datafusion/{functions-spark => spark}/src/hash_funcs/murmur3.rs (100%) rename 
datafusion/{functions-spark => spark}/src/hash_funcs/sha2.rs (100%) rename datafusion/{functions-spark => spark}/src/hash_funcs/utils.rs (100%) rename datafusion/{functions-spark => spark}/src/hash_funcs/xxhash64.rs (100%) rename datafusion/{functions-spark => spark}/src/json_funcs/mod.rs (100%) rename datafusion/{functions-spark => spark}/src/json_funcs/to_json.rs (100%) rename datafusion/{functions-spark => spark}/src/kernels/mod.rs (100%) rename datafusion/{functions-spark => spark}/src/kernels/strings.rs (100%) rename datafusion/{functions-spark => spark}/src/kernels/temporal.rs (100%) rename datafusion/{functions-spark => spark}/src/lib.rs (100%) rename datafusion/{functions-spark => spark}/src/math_funcs/ceil.rs (100%) rename datafusion/{functions-spark => spark}/src/math_funcs/div.rs (100%) rename datafusion/{functions-spark => spark}/src/math_funcs/floor.rs (100%) rename datafusion/{functions-spark => spark}/src/math_funcs/hex.rs (100%) rename datafusion/{functions-spark => spark}/src/math_funcs/internal/checkoverflow.rs (100%) rename datafusion/{functions-spark => spark}/src/math_funcs/internal/make_decimal.rs (100%) rename datafusion/{functions-spark => spark}/src/math_funcs/internal/mod.rs (100%) rename datafusion/{functions-spark => spark}/src/math_funcs/internal/normalize_nan.rs (100%) rename datafusion/{functions-spark => spark}/src/math_funcs/internal/unscaled_value.rs (100%) rename datafusion/{functions-spark => spark}/src/math_funcs/mod.rs (100%) rename datafusion/{functions-spark => spark}/src/math_funcs/negative.rs (100%) rename datafusion/{functions-spark => spark}/src/math_funcs/round.rs (100%) rename datafusion/{functions-spark => spark}/src/math_funcs/unhex.rs (100%) rename datafusion/{functions-spark => spark}/src/math_funcs/utils.rs (100%) rename datafusion/{functions-spark => spark}/src/predicate_funcs/is_nan.rs (100%) rename datafusion/{functions-spark => spark}/src/predicate_funcs/mod.rs (100%) rename datafusion/{functions-spark => spark}/src/predicate_funcs/rlike.rs (100%) rename datafusion/{functions-spark => spark}/src/static_invoke/char_varchar_utils/mod.rs (100%) rename datafusion/{functions-spark => spark}/src/static_invoke/char_varchar_utils/read_side_padding.rs (100%) rename datafusion/{functions-spark => spark}/src/static_invoke/mod.rs (100%) rename datafusion/{functions-spark => spark}/src/string_funcs/chr.rs (100%) rename datafusion/{functions-spark => spark}/src/string_funcs/mod.rs (100%) rename datafusion/{functions-spark => spark}/src/string_funcs/prediction.rs (100%) rename datafusion/{functions-spark => spark}/src/string_funcs/string_space.rs (100%) rename datafusion/{functions-spark => spark}/src/string_funcs/substring.rs (100%) rename datafusion/{functions-spark => spark}/src/struct_funcs/create_named_struct.rs (100%) rename datafusion/{functions-spark => spark}/src/struct_funcs/get_struct_field.rs (100%) rename datafusion/{functions-spark => spark}/src/struct_funcs/mod.rs (100%) rename datafusion/{functions-spark => spark}/src/test_common/file_util.rs (100%) rename datafusion/{functions-spark => spark}/src/test_common/mod.rs (100%) rename datafusion/{functions-spark => spark}/src/timezone.rs (100%) rename datafusion/{functions-spark => spark}/src/unbound.rs (100%) rename datafusion/{functions-spark => spark}/src/utils.rs (100%) diff --git a/Cargo.toml b/Cargo.toml index 63b9c0d3315e..e8f94885e79e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -32,7 +32,6 @@ members = [ "datafusion/functions-aggregate-common", "datafusion/functions-table", 
"datafusion/functions-nested", - "datafusion/functions-spark", "datafusion/functions-window", "datafusion/functions-window-common", "datafusion/optimizer", @@ -44,6 +43,7 @@ members = [ "datafusion/proto/gen", "datafusion/proto-common", "datafusion/proto-common/gen", + "datafusion/spark", "datafusion/sql", "datafusion/sqllogictest", "datafusion/substrait", diff --git a/datafusion/functions-spark/Cargo.toml b/datafusion/spark/Cargo.toml similarity index 98% rename from datafusion/functions-spark/Cargo.toml rename to datafusion/spark/Cargo.toml index eaa48845820f..be5a897d1648 100644 --- a/datafusion/functions-spark/Cargo.toml +++ b/datafusion/spark/Cargo.toml @@ -16,7 +16,7 @@ # under the License. [package] -name = "datafusion-functions-spark" +name = "datafusion-spark" description = "DataFusion expressions that emulate Apache Spark's behavior" version = { workspace = true } homepage = { workspace = true } diff --git a/datafusion/functions-spark/README.md b/datafusion/spark/README.md similarity index 93% rename from datafusion/functions-spark/README.md rename to datafusion/spark/README.md index 42c7f4d2a8e4..afd94d2c0690 100644 --- a/datafusion/functions-spark/README.md +++ b/datafusion/spark/README.md @@ -17,6 +17,6 @@ specific language governing permissions and limitations under the License. --> -# datafusion-functions-spark: Spark-compatible Expressions +# datafusion-spark: Spark-compatible Expressions This crate provides Apache Spark-compatible expressions for use with DataFusion. diff --git a/datafusion/functions-spark/benches/aggregate.rs b/datafusion/spark/benches/aggregate.rs similarity index 100% rename from datafusion/functions-spark/benches/aggregate.rs rename to datafusion/spark/benches/aggregate.rs diff --git a/datafusion/functions-spark/benches/cast_from_string.rs b/datafusion/spark/benches/cast_from_string.rs similarity index 100% rename from datafusion/functions-spark/benches/cast_from_string.rs rename to datafusion/spark/benches/cast_from_string.rs diff --git a/datafusion/functions-spark/benches/cast_numeric.rs b/datafusion/spark/benches/cast_numeric.rs similarity index 100% rename from datafusion/functions-spark/benches/cast_numeric.rs rename to datafusion/spark/benches/cast_numeric.rs diff --git a/datafusion/functions-spark/benches/conditional.rs b/datafusion/spark/benches/conditional.rs similarity index 100% rename from datafusion/functions-spark/benches/conditional.rs rename to datafusion/spark/benches/conditional.rs diff --git a/datafusion/functions-spark/benches/decimal_div.rs b/datafusion/spark/benches/decimal_div.rs similarity index 100% rename from datafusion/functions-spark/benches/decimal_div.rs rename to datafusion/spark/benches/decimal_div.rs diff --git a/datafusion/functions-spark/src/agg_funcs/avg.rs b/datafusion/spark/src/agg_funcs/avg.rs similarity index 100% rename from datafusion/functions-spark/src/agg_funcs/avg.rs rename to datafusion/spark/src/agg_funcs/avg.rs diff --git a/datafusion/functions-spark/src/agg_funcs/avg_decimal.rs b/datafusion/spark/src/agg_funcs/avg_decimal.rs similarity index 100% rename from datafusion/functions-spark/src/agg_funcs/avg_decimal.rs rename to datafusion/spark/src/agg_funcs/avg_decimal.rs diff --git a/datafusion/functions-spark/src/agg_funcs/correlation.rs b/datafusion/spark/src/agg_funcs/correlation.rs similarity index 100% rename from datafusion/functions-spark/src/agg_funcs/correlation.rs rename to datafusion/spark/src/agg_funcs/correlation.rs diff --git a/datafusion/functions-spark/src/agg_funcs/covariance.rs 
b/datafusion/spark/src/agg_funcs/covariance.rs similarity index 100% rename from datafusion/functions-spark/src/agg_funcs/covariance.rs rename to datafusion/spark/src/agg_funcs/covariance.rs diff --git a/datafusion/functions-spark/src/agg_funcs/mod.rs b/datafusion/spark/src/agg_funcs/mod.rs similarity index 100% rename from datafusion/functions-spark/src/agg_funcs/mod.rs rename to datafusion/spark/src/agg_funcs/mod.rs diff --git a/datafusion/functions-spark/src/agg_funcs/stddev.rs b/datafusion/spark/src/agg_funcs/stddev.rs similarity index 100% rename from datafusion/functions-spark/src/agg_funcs/stddev.rs rename to datafusion/spark/src/agg_funcs/stddev.rs diff --git a/datafusion/functions-spark/src/agg_funcs/sum_decimal.rs b/datafusion/spark/src/agg_funcs/sum_decimal.rs similarity index 100% rename from datafusion/functions-spark/src/agg_funcs/sum_decimal.rs rename to datafusion/spark/src/agg_funcs/sum_decimal.rs diff --git a/datafusion/functions-spark/src/agg_funcs/variance.rs b/datafusion/spark/src/agg_funcs/variance.rs similarity index 100% rename from datafusion/functions-spark/src/agg_funcs/variance.rs rename to datafusion/spark/src/agg_funcs/variance.rs diff --git a/datafusion/functions-spark/src/array_funcs/array_insert.rs b/datafusion/spark/src/array_funcs/array_insert.rs similarity index 100% rename from datafusion/functions-spark/src/array_funcs/array_insert.rs rename to datafusion/spark/src/array_funcs/array_insert.rs diff --git a/datafusion/functions-spark/src/array_funcs/get_array_struct_fields.rs b/datafusion/spark/src/array_funcs/get_array_struct_fields.rs similarity index 100% rename from datafusion/functions-spark/src/array_funcs/get_array_struct_fields.rs rename to datafusion/spark/src/array_funcs/get_array_struct_fields.rs diff --git a/datafusion/functions-spark/src/array_funcs/list_extract.rs b/datafusion/spark/src/array_funcs/list_extract.rs similarity index 100% rename from datafusion/functions-spark/src/array_funcs/list_extract.rs rename to datafusion/spark/src/array_funcs/list_extract.rs diff --git a/datafusion/functions-spark/src/array_funcs/mod.rs b/datafusion/spark/src/array_funcs/mod.rs similarity index 100% rename from datafusion/functions-spark/src/array_funcs/mod.rs rename to datafusion/spark/src/array_funcs/mod.rs diff --git a/datafusion/functions-spark/src/bitwise_funcs/bitwise_not.rs b/datafusion/spark/src/bitwise_funcs/bitwise_not.rs similarity index 100% rename from datafusion/functions-spark/src/bitwise_funcs/bitwise_not.rs rename to datafusion/spark/src/bitwise_funcs/bitwise_not.rs diff --git a/datafusion/functions-spark/src/bitwise_funcs/mod.rs b/datafusion/spark/src/bitwise_funcs/mod.rs similarity index 100% rename from datafusion/functions-spark/src/bitwise_funcs/mod.rs rename to datafusion/spark/src/bitwise_funcs/mod.rs diff --git a/datafusion/functions-spark/src/comet_scalar_funcs.rs b/datafusion/spark/src/comet_scalar_funcs.rs similarity index 100% rename from datafusion/functions-spark/src/comet_scalar_funcs.rs rename to datafusion/spark/src/comet_scalar_funcs.rs diff --git a/datafusion/functions-spark/src/conditional_funcs/if_expr.rs b/datafusion/spark/src/conditional_funcs/if_expr.rs similarity index 100% rename from datafusion/functions-spark/src/conditional_funcs/if_expr.rs rename to datafusion/spark/src/conditional_funcs/if_expr.rs diff --git a/datafusion/functions-spark/src/conditional_funcs/mod.rs b/datafusion/spark/src/conditional_funcs/mod.rs similarity index 100% rename from datafusion/functions-spark/src/conditional_funcs/mod.rs 
rename to datafusion/spark/src/conditional_funcs/mod.rs diff --git a/datafusion/functions-spark/src/conversion_funcs/cast.rs b/datafusion/spark/src/conversion_funcs/cast.rs similarity index 100% rename from datafusion/functions-spark/src/conversion_funcs/cast.rs rename to datafusion/spark/src/conversion_funcs/cast.rs diff --git a/datafusion/functions-spark/src/conversion_funcs/mod.rs b/datafusion/spark/src/conversion_funcs/mod.rs similarity index 100% rename from datafusion/functions-spark/src/conversion_funcs/mod.rs rename to datafusion/spark/src/conversion_funcs/mod.rs diff --git a/datafusion/functions-spark/src/datetime_funcs/date_arithmetic.rs b/datafusion/spark/src/datetime_funcs/date_arithmetic.rs similarity index 100% rename from datafusion/functions-spark/src/datetime_funcs/date_arithmetic.rs rename to datafusion/spark/src/datetime_funcs/date_arithmetic.rs diff --git a/datafusion/functions-spark/src/datetime_funcs/date_trunc.rs b/datafusion/spark/src/datetime_funcs/date_trunc.rs similarity index 100% rename from datafusion/functions-spark/src/datetime_funcs/date_trunc.rs rename to datafusion/spark/src/datetime_funcs/date_trunc.rs diff --git a/datafusion/functions-spark/src/datetime_funcs/hour.rs b/datafusion/spark/src/datetime_funcs/hour.rs similarity index 100% rename from datafusion/functions-spark/src/datetime_funcs/hour.rs rename to datafusion/spark/src/datetime_funcs/hour.rs diff --git a/datafusion/functions-spark/src/datetime_funcs/minute.rs b/datafusion/spark/src/datetime_funcs/minute.rs similarity index 100% rename from datafusion/functions-spark/src/datetime_funcs/minute.rs rename to datafusion/spark/src/datetime_funcs/minute.rs diff --git a/datafusion/functions-spark/src/datetime_funcs/mod.rs b/datafusion/spark/src/datetime_funcs/mod.rs similarity index 100% rename from datafusion/functions-spark/src/datetime_funcs/mod.rs rename to datafusion/spark/src/datetime_funcs/mod.rs diff --git a/datafusion/functions-spark/src/datetime_funcs/second.rs b/datafusion/spark/src/datetime_funcs/second.rs similarity index 100% rename from datafusion/functions-spark/src/datetime_funcs/second.rs rename to datafusion/spark/src/datetime_funcs/second.rs diff --git a/datafusion/functions-spark/src/datetime_funcs/timestamp_trunc.rs b/datafusion/spark/src/datetime_funcs/timestamp_trunc.rs similarity index 100% rename from datafusion/functions-spark/src/datetime_funcs/timestamp_trunc.rs rename to datafusion/spark/src/datetime_funcs/timestamp_trunc.rs diff --git a/datafusion/functions-spark/src/error.rs b/datafusion/spark/src/error.rs similarity index 100% rename from datafusion/functions-spark/src/error.rs rename to datafusion/spark/src/error.rs diff --git a/datafusion/functions-spark/src/hash_funcs/mod.rs b/datafusion/spark/src/hash_funcs/mod.rs similarity index 100% rename from datafusion/functions-spark/src/hash_funcs/mod.rs rename to datafusion/spark/src/hash_funcs/mod.rs diff --git a/datafusion/functions-spark/src/hash_funcs/murmur3.rs b/datafusion/spark/src/hash_funcs/murmur3.rs similarity index 100% rename from datafusion/functions-spark/src/hash_funcs/murmur3.rs rename to datafusion/spark/src/hash_funcs/murmur3.rs diff --git a/datafusion/functions-spark/src/hash_funcs/sha2.rs b/datafusion/spark/src/hash_funcs/sha2.rs similarity index 100% rename from datafusion/functions-spark/src/hash_funcs/sha2.rs rename to datafusion/spark/src/hash_funcs/sha2.rs diff --git a/datafusion/functions-spark/src/hash_funcs/utils.rs b/datafusion/spark/src/hash_funcs/utils.rs similarity index 100% rename from 
datafusion/functions-spark/src/hash_funcs/utils.rs rename to datafusion/spark/src/hash_funcs/utils.rs diff --git a/datafusion/functions-spark/src/hash_funcs/xxhash64.rs b/datafusion/spark/src/hash_funcs/xxhash64.rs similarity index 100% rename from datafusion/functions-spark/src/hash_funcs/xxhash64.rs rename to datafusion/spark/src/hash_funcs/xxhash64.rs diff --git a/datafusion/functions-spark/src/json_funcs/mod.rs b/datafusion/spark/src/json_funcs/mod.rs similarity index 100% rename from datafusion/functions-spark/src/json_funcs/mod.rs rename to datafusion/spark/src/json_funcs/mod.rs diff --git a/datafusion/functions-spark/src/json_funcs/to_json.rs b/datafusion/spark/src/json_funcs/to_json.rs similarity index 100% rename from datafusion/functions-spark/src/json_funcs/to_json.rs rename to datafusion/spark/src/json_funcs/to_json.rs diff --git a/datafusion/functions-spark/src/kernels/mod.rs b/datafusion/spark/src/kernels/mod.rs similarity index 100% rename from datafusion/functions-spark/src/kernels/mod.rs rename to datafusion/spark/src/kernels/mod.rs diff --git a/datafusion/functions-spark/src/kernels/strings.rs b/datafusion/spark/src/kernels/strings.rs similarity index 100% rename from datafusion/functions-spark/src/kernels/strings.rs rename to datafusion/spark/src/kernels/strings.rs diff --git a/datafusion/functions-spark/src/kernels/temporal.rs b/datafusion/spark/src/kernels/temporal.rs similarity index 100% rename from datafusion/functions-spark/src/kernels/temporal.rs rename to datafusion/spark/src/kernels/temporal.rs diff --git a/datafusion/functions-spark/src/lib.rs b/datafusion/spark/src/lib.rs similarity index 100% rename from datafusion/functions-spark/src/lib.rs rename to datafusion/spark/src/lib.rs diff --git a/datafusion/functions-spark/src/math_funcs/ceil.rs b/datafusion/spark/src/math_funcs/ceil.rs similarity index 100% rename from datafusion/functions-spark/src/math_funcs/ceil.rs rename to datafusion/spark/src/math_funcs/ceil.rs diff --git a/datafusion/functions-spark/src/math_funcs/div.rs b/datafusion/spark/src/math_funcs/div.rs similarity index 100% rename from datafusion/functions-spark/src/math_funcs/div.rs rename to datafusion/spark/src/math_funcs/div.rs diff --git a/datafusion/functions-spark/src/math_funcs/floor.rs b/datafusion/spark/src/math_funcs/floor.rs similarity index 100% rename from datafusion/functions-spark/src/math_funcs/floor.rs rename to datafusion/spark/src/math_funcs/floor.rs diff --git a/datafusion/functions-spark/src/math_funcs/hex.rs b/datafusion/spark/src/math_funcs/hex.rs similarity index 100% rename from datafusion/functions-spark/src/math_funcs/hex.rs rename to datafusion/spark/src/math_funcs/hex.rs diff --git a/datafusion/functions-spark/src/math_funcs/internal/checkoverflow.rs b/datafusion/spark/src/math_funcs/internal/checkoverflow.rs similarity index 100% rename from datafusion/functions-spark/src/math_funcs/internal/checkoverflow.rs rename to datafusion/spark/src/math_funcs/internal/checkoverflow.rs diff --git a/datafusion/functions-spark/src/math_funcs/internal/make_decimal.rs b/datafusion/spark/src/math_funcs/internal/make_decimal.rs similarity index 100% rename from datafusion/functions-spark/src/math_funcs/internal/make_decimal.rs rename to datafusion/spark/src/math_funcs/internal/make_decimal.rs diff --git a/datafusion/functions-spark/src/math_funcs/internal/mod.rs b/datafusion/spark/src/math_funcs/internal/mod.rs similarity index 100% rename from datafusion/functions-spark/src/math_funcs/internal/mod.rs rename to 
datafusion/spark/src/math_funcs/internal/mod.rs diff --git a/datafusion/functions-spark/src/math_funcs/internal/normalize_nan.rs b/datafusion/spark/src/math_funcs/internal/normalize_nan.rs similarity index 100% rename from datafusion/functions-spark/src/math_funcs/internal/normalize_nan.rs rename to datafusion/spark/src/math_funcs/internal/normalize_nan.rs diff --git a/datafusion/functions-spark/src/math_funcs/internal/unscaled_value.rs b/datafusion/spark/src/math_funcs/internal/unscaled_value.rs similarity index 100% rename from datafusion/functions-spark/src/math_funcs/internal/unscaled_value.rs rename to datafusion/spark/src/math_funcs/internal/unscaled_value.rs diff --git a/datafusion/functions-spark/src/math_funcs/mod.rs b/datafusion/spark/src/math_funcs/mod.rs similarity index 100% rename from datafusion/functions-spark/src/math_funcs/mod.rs rename to datafusion/spark/src/math_funcs/mod.rs diff --git a/datafusion/functions-spark/src/math_funcs/negative.rs b/datafusion/spark/src/math_funcs/negative.rs similarity index 100% rename from datafusion/functions-spark/src/math_funcs/negative.rs rename to datafusion/spark/src/math_funcs/negative.rs diff --git a/datafusion/functions-spark/src/math_funcs/round.rs b/datafusion/spark/src/math_funcs/round.rs similarity index 100% rename from datafusion/functions-spark/src/math_funcs/round.rs rename to datafusion/spark/src/math_funcs/round.rs diff --git a/datafusion/functions-spark/src/math_funcs/unhex.rs b/datafusion/spark/src/math_funcs/unhex.rs similarity index 100% rename from datafusion/functions-spark/src/math_funcs/unhex.rs rename to datafusion/spark/src/math_funcs/unhex.rs diff --git a/datafusion/functions-spark/src/math_funcs/utils.rs b/datafusion/spark/src/math_funcs/utils.rs similarity index 100% rename from datafusion/functions-spark/src/math_funcs/utils.rs rename to datafusion/spark/src/math_funcs/utils.rs diff --git a/datafusion/functions-spark/src/predicate_funcs/is_nan.rs b/datafusion/spark/src/predicate_funcs/is_nan.rs similarity index 100% rename from datafusion/functions-spark/src/predicate_funcs/is_nan.rs rename to datafusion/spark/src/predicate_funcs/is_nan.rs diff --git a/datafusion/functions-spark/src/predicate_funcs/mod.rs b/datafusion/spark/src/predicate_funcs/mod.rs similarity index 100% rename from datafusion/functions-spark/src/predicate_funcs/mod.rs rename to datafusion/spark/src/predicate_funcs/mod.rs diff --git a/datafusion/functions-spark/src/predicate_funcs/rlike.rs b/datafusion/spark/src/predicate_funcs/rlike.rs similarity index 100% rename from datafusion/functions-spark/src/predicate_funcs/rlike.rs rename to datafusion/spark/src/predicate_funcs/rlike.rs diff --git a/datafusion/functions-spark/src/static_invoke/char_varchar_utils/mod.rs b/datafusion/spark/src/static_invoke/char_varchar_utils/mod.rs similarity index 100% rename from datafusion/functions-spark/src/static_invoke/char_varchar_utils/mod.rs rename to datafusion/spark/src/static_invoke/char_varchar_utils/mod.rs diff --git a/datafusion/functions-spark/src/static_invoke/char_varchar_utils/read_side_padding.rs b/datafusion/spark/src/static_invoke/char_varchar_utils/read_side_padding.rs similarity index 100% rename from datafusion/functions-spark/src/static_invoke/char_varchar_utils/read_side_padding.rs rename to datafusion/spark/src/static_invoke/char_varchar_utils/read_side_padding.rs diff --git a/datafusion/functions-spark/src/static_invoke/mod.rs b/datafusion/spark/src/static_invoke/mod.rs similarity index 100% rename from 
datafusion/functions-spark/src/static_invoke/mod.rs rename to datafusion/spark/src/static_invoke/mod.rs diff --git a/datafusion/functions-spark/src/string_funcs/chr.rs b/datafusion/spark/src/string_funcs/chr.rs similarity index 100% rename from datafusion/functions-spark/src/string_funcs/chr.rs rename to datafusion/spark/src/string_funcs/chr.rs diff --git a/datafusion/functions-spark/src/string_funcs/mod.rs b/datafusion/spark/src/string_funcs/mod.rs similarity index 100% rename from datafusion/functions-spark/src/string_funcs/mod.rs rename to datafusion/spark/src/string_funcs/mod.rs diff --git a/datafusion/functions-spark/src/string_funcs/prediction.rs b/datafusion/spark/src/string_funcs/prediction.rs similarity index 100% rename from datafusion/functions-spark/src/string_funcs/prediction.rs rename to datafusion/spark/src/string_funcs/prediction.rs diff --git a/datafusion/functions-spark/src/string_funcs/string_space.rs b/datafusion/spark/src/string_funcs/string_space.rs similarity index 100% rename from datafusion/functions-spark/src/string_funcs/string_space.rs rename to datafusion/spark/src/string_funcs/string_space.rs diff --git a/datafusion/functions-spark/src/string_funcs/substring.rs b/datafusion/spark/src/string_funcs/substring.rs similarity index 100% rename from datafusion/functions-spark/src/string_funcs/substring.rs rename to datafusion/spark/src/string_funcs/substring.rs diff --git a/datafusion/functions-spark/src/struct_funcs/create_named_struct.rs b/datafusion/spark/src/struct_funcs/create_named_struct.rs similarity index 100% rename from datafusion/functions-spark/src/struct_funcs/create_named_struct.rs rename to datafusion/spark/src/struct_funcs/create_named_struct.rs diff --git a/datafusion/functions-spark/src/struct_funcs/get_struct_field.rs b/datafusion/spark/src/struct_funcs/get_struct_field.rs similarity index 100% rename from datafusion/functions-spark/src/struct_funcs/get_struct_field.rs rename to datafusion/spark/src/struct_funcs/get_struct_field.rs diff --git a/datafusion/functions-spark/src/struct_funcs/mod.rs b/datafusion/spark/src/struct_funcs/mod.rs similarity index 100% rename from datafusion/functions-spark/src/struct_funcs/mod.rs rename to datafusion/spark/src/struct_funcs/mod.rs diff --git a/datafusion/functions-spark/src/test_common/file_util.rs b/datafusion/spark/src/test_common/file_util.rs similarity index 100% rename from datafusion/functions-spark/src/test_common/file_util.rs rename to datafusion/spark/src/test_common/file_util.rs diff --git a/datafusion/functions-spark/src/test_common/mod.rs b/datafusion/spark/src/test_common/mod.rs similarity index 100% rename from datafusion/functions-spark/src/test_common/mod.rs rename to datafusion/spark/src/test_common/mod.rs diff --git a/datafusion/functions-spark/src/timezone.rs b/datafusion/spark/src/timezone.rs similarity index 100% rename from datafusion/functions-spark/src/timezone.rs rename to datafusion/spark/src/timezone.rs diff --git a/datafusion/functions-spark/src/unbound.rs b/datafusion/spark/src/unbound.rs similarity index 100% rename from datafusion/functions-spark/src/unbound.rs rename to datafusion/spark/src/unbound.rs diff --git a/datafusion/functions-spark/src/utils.rs b/datafusion/spark/src/utils.rs similarity index 100% rename from datafusion/functions-spark/src/utils.rs rename to datafusion/spark/src/utils.rs
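The rename above moves the crate directory from `datafusion/functions-spark` to `datafusion/spark` and changes the Cargo package name from `datafusion-functions-spark` to `datafusion-spark`; the `[lib]` target name shown in the earlier Cargo.toml (`datafusion_comet_spark_expr`) is not touched by this patch, so existing `use datafusion_comet_spark_expr::...` paths keep compiling. A downstream `Cargo.toml` entry would be updated roughly as follows — a sketch only, where the `path` values are illustrative workspace-relative paths rather than anything defined in this patch.

```toml
[dependencies]
# Before this patch the entry would have referenced the old package name:
# datafusion-functions-spark = { path = "../datafusion/functions-spark" }
datafusion-spark = { path = "../datafusion/spark" }
```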