apache · jayzhan211 · Sep 4, 2024 · Aug 31, 2024 · Aug 31, 2024 · Aug 31, 2024
diff --git a/datafusion/functions-aggregate/src/kurtosis_pop.rs b/datafusion/functions-aggregate/src/kurtosis_pop.rs
@@ -0,0 +1,199 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::{Array, ArrayRef, Float64Array, UInt64Array};
+use arrow::compute::cast;
+use arrow_schema::{DataType, Field};
+use datafusion_common::{
+    downcast_value, plan_err, unwrap_or_internal_err, DataFusionError, Result,
+    ScalarValue,
+};
+use datafusion_expr::{Accumulator, AggregateUDFImpl, Signature, Volatility};
+use datafusion_functions_aggregate_common::accumulator::{
+    AccumulatorArgs, StateFieldsArgs,
+};
+use std::any::Any;
+use std::fmt::Debug;
+
+// Invokes `make_udaf_expr_and_func!` for `KurtosisPopFunction`.
+// The identifiers passed here (`kurtosis_pop`, `kurtosis_pop_udaf`) are the
+// names other parts of the crate call: `kurtosis_pop(expr)` as an expression
+// builder and `kurtosis_pop_udaf()` when listing the default aggregates.
+// NOTE(review): the string literal is presumably used as the generated
+// function's documentation; confirm against the macro definition.
+make_udaf_expr_and_func!(
+    KurtosisPopFunction,
+    kurtosis_pop,
+    x,
+    "Calculates the excess kurtosis (Fisher’s definition) without bias correction.",
+    kurtosis_pop_udaf
+);
+
+/// Aggregate UDF implementation that reports its name as `kurtosis_pop`.
+///
+/// The only state it carries is the argument [`Signature`].
+pub struct KurtosisPopFunction {
+    /// Argument signature handed out by `AggregateUDFImpl::signature`.
+    signature: Signature,
+}
+
+impl Debug for KurtosisPopFunction {
+    /// Formats the value as a struct named `KurtosisPopFunction` with its
+    /// single `signature` field.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let mut builder = f.debug_struct("KurtosisPopFunction");
+        builder.field("signature", &self.signature);
+        builder.finish()
+    }
+}
+
+impl Default for KurtosisPopFunction {
+    /// Equivalent to [`KurtosisPopFunction::new`].
+    fn default() -> Self {
+        KurtosisPopFunction::new()
+    }
+}
+
+impl KurtosisPopFunction {
+    /// Creates the function with a one-argument numeric, immutable signature.
+    pub fn new() -> Self {
+        let signature = Signature::numeric(1, Volatility::Immutable);
+        Self { signature }
+    }
+}
+
+impl AggregateUDFImpl for KurtosisPopFunction {
+    /// Exposes this value as `&dyn Any`.
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    /// Name of the aggregate function: `kurtosis_pop`.
+    fn name(&self) -> &str {
+        "kurtosis_pop"
+    }
+
+    /// Borrows the stored argument signature.
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    /// The result type is always `Float64`.
+    ///
+    /// Returns a planning error when the first argument type is neither null
+    /// nor numeric.
+    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
+        let input_type = &arg_types[0];
+        if input_type.is_null() || input_type.is_numeric() {
+            Ok(DataType::Float64)
+        } else {
+            plan_err!("KurtosisPop requires numeric input types")
+        }
+    }
+
+    /// Describes the intermediate state: a nullable `UInt64` count followed by
+    /// four nullable `Float64` power sums.
+    fn state_fields(&self, _args: StateFieldsArgs) -> Result<Vec<Field>> {
+        let float_sums = ["sum", "sum_sqr", "sum_cub", "sum_four"];
+
+        let mut fields = Vec::with_capacity(1 + float_sums.len());
+        fields.push(Field::new("count", DataType::UInt64, true));
+        fields.extend(
+            float_sums
+                .iter()
+                .map(|name| Field::new(*name, DataType::Float64, true)),
+        );
+        Ok(fields)
+    }
+
+    /// Builds a fresh, empty accumulator.
+    fn accumulator(&self, _acc_args: AccumulatorArgs) -> Result<Box<dyn Accumulator>> {
+        let accumulator = KurtosisPopAccumulator::new();
+        Ok(Box::new(accumulator))
+    }
+}
+
+/// Running state for `kurtosis_pop`: a value count plus the sums of the first
+/// four powers of the accumulated values.
+#[derive(Debug, Default)]
+pub struct KurtosisPopAccumulator {
+    /// Number of values accumulated so far.
+    count: u64,
+    /// Sum of the values.
+    sum: f64,
+    /// Sum of the squared values.
+    sum_sqr: f64,
+    /// Sum of the cubed values.
+    sum_cub: f64,
+    /// Sum of the values raised to the fourth power.
+    sum_four: f64,
+}
+
+impl KurtosisPopAccumulator {
+    /// Creates an accumulator with a zero count and all power sums at `0.0`,
+    /// which is exactly what the derived `Default` produces.
+    pub fn new() -> Self {
+        Self::default()
+    }
+}
+
+impl Accumulator for KurtosisPopAccumulator {
+    /// Folds one batch of input values into the running sums.
+    ///
+    /// The first array in `values` is cast to `Float64`. Null entries are
+    /// skipped and every remaining value is accumulated exactly once.
+    ///
+    /// # Errors
+    /// Returns an error if the cast to `Float64` fails or the cast result
+    /// cannot be downcast to `Float64Array`.
+    fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
+        let array = cast(&values[0], &DataType::Float64)?;
+        // `iter()` yields `Option<f64>`; `flatten()` drops the null slots.
+        for value in downcast_value!(array, Float64Array).iter().flatten() {
+            self.count += 1;
+            self.sum += value;
+            self.sum_sqr += value.powi(2);
+            self.sum_cub += value.powi(3);
+            self.sum_four += value.powi(4);
+        }
+        Ok(())
+    }
+
+    /// Adds partial states into this accumulator.
+    ///
+    /// `states` holds five parallel arrays in the order count, sum, sum of
+    /// squares, sum of cubes and sum of fourth powers. Rows whose count is
+    /// zero contribute nothing and are skipped.
+    fn merge_batch(&mut self, states: &[ArrayRef]) -> Result<()> {
+        let partial_counts = downcast_value!(states[0], UInt64Array);
+        let partial_sums = downcast_value!(states[1], Float64Array);
+        let partial_sum_sqrs = downcast_value!(states[2], Float64Array);
+        let partial_sum_cubs = downcast_value!(states[3], Float64Array);
+        let partial_sum_fours = downcast_value!(states[4], Float64Array);
+
+        for row in 0..partial_counts.len() {
+            let n = partial_counts.value(row);
+            if n != 0 {
+                self.count += n;
+                self.sum += partial_sums.value(row);
+                self.sum_sqr += partial_sum_sqrs.value(row);
+                self.sum_cub += partial_sum_cubs.value(row);
+                self.sum_four += partial_sum_fours.value(row);
+            }
+        }
+
+        Ok(())
+    }
+
+    /// Computes the population excess kurtosis `m4 / m2^2 - 3` from the
+    /// accumulated power sums.
+    ///
+    /// Returns a null `Float64` when no values were accumulated or when the
+    /// second central moment is not positive (the ratio would be undefined).
+    fn evaluate(&mut self) -> Result<ScalarValue> {
+        if self.count == 0 {
+            return Ok(ScalarValue::Float64(None));
+        }
+
+        // Reciprocal of the count; both moments below are expressed with it so
+        // the two formulas stay consistent with each other.
+        let inv_n = 1_f64 / self.count as f64;
+
+        // Fourth central moment expanded in terms of the raw power sums.
+        let m4 = inv_n
+            * (self.sum_four - 4.0 * self.sum_cub * self.sum * inv_n
+                + 6.0 * self.sum_sqr * self.sum.powi(2) * inv_n.powi(2)
+                - 3.0 * self.sum.powi(4) * inv_n.powi(3));
+
+        // Second central moment (population variance).
+        let m2 = (self.sum_sqr - self.sum.powi(2) * inv_n) * inv_n;
+        if m2 <= 0.0 {
+            return Ok(ScalarValue::Float64(None));
+        }
+
+        let target = m4 / (m2.powi(2)) - 3.0;
+        Ok(ScalarValue::Float64(Some(target)))
+    }
+
+    /// Size in bytes of the accumulator struct itself.
+    fn size(&self) -> usize {
+        std::mem::size_of::<Self>()
+    }
+
+    /// Returns the intermediate state as five scalars: the count followed by
+    /// the sums of the first, second, third and fourth powers.
+    fn state(&mut self) -> Result<Vec<ScalarValue>> {
+        let count = ScalarValue::UInt64(Some(self.count));
+        let sum = ScalarValue::Float64(Some(self.sum));
+        let sum_sqr = ScalarValue::Float64(Some(self.sum_sqr));
+        let sum_cub = ScalarValue::Float64(Some(self.sum_cub));
+        let sum_four = ScalarValue::Float64(Some(self.sum_four));
+        Ok(vec![count, sum, sum_sqr, sum_cub, sum_four])
+    }
+}
diff --git a/datafusion/functions-aggregate/src/lib.rs b/datafusion/functions-aggregate/src/lib.rs
@@ -78,6 +78,7 @@ pub mod average;
 pub mod bit_and_or_xor;
 pub mod bool_and_or;
 pub mod grouping;
+pub mod kurtosis_pop;
 pub mod nth_value;
 pub mod string_agg;
 
@@ -169,6 +170,7 @@ pub fn all_default_aggregate_functions() -> Vec<Arc<AggregateUDF>> {
         average::avg_udaf(),
         grouping::grouping_udaf(),
         nth_value::nth_value_udaf(),
+        kurtosis_pop::kurtosis_pop_udaf(),
     ]
 }
 

diff --git a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs
@@ -72,6 +72,7 @@ use datafusion_functions_aggregate::average::avg_udaf;
 use datafusion_functions_aggregate::expr_fn::{
     approx_distinct, array_agg, avg, bit_and, bit_or, bit_xor, bool_and, bool_or, corr,
 };
+use datafusion_functions_aggregate::kurtosis_pop::kurtosis_pop;
 use datafusion_functions_aggregate::string_agg::string_agg;
 use datafusion_proto::bytes::{
     logical_plan_from_bytes, logical_plan_from_bytes_with_extension_codec,
@@ -903,6 +904,7 @@ async fn roundtrip_expr_api() -> Result<()> {
             vec![lit(10), lit(20), lit(30)],
         ),
         row_number(),
+        kurtosis_pop(lit(1)),
     ];
 
     // ensure expressions created with the expr api can be round tripped

diff --git a/datafusion/sqllogictest/test_files/aggregate.slt b/datafusion/sqllogictest/test_files/aggregate.slt
@@ -5863,3 +5863,62 @@ ORDER BY k;
 ----
 1 1.8125 6.8007813 Float16 Float16
 2 8.5 8.5 Float16 Float16
+
+# The exact result is 0.19432323191699075; the expected output below is rounded.
+query R
+SELECT kurtosis_pop(col) FROM VALUES (1), (10), (100), (10), (1) as tab(col);
+----
+0.194323231917
+
+# The exact result is -1.153061224489787; the expected output below is rounded.
+query R
+SELECT kurtosis_pop(col) FROM VALUES (1), (2), (3), (2), (1) as tab(col);
+----
+-1.15306122449
+
+query R
+SELECT kurtosis_pop(col) FROM VALUES (1.0), (10.0), (100.0), (10.0), (1.0) as tab(col);
+----
+0.194323231917
+
+query error DataFusion error: Error during planning: KurtosisPop requires numeric input types
+SELECT kurtosis_pop(col) FROM VALUES ('1'), ('10'), ('100'), ('10'), ('1') as tab(col);
+
+query R
+SELECT kurtosis_pop(col) FROM VALUES (1.0) as tab(col);
+----
+NULL
+
+query R
+SELECT kurtosis_pop(1)
+----
+NULL
+
+query R
+SELECT kurtosis_pop(1.0)
+----
+NULL
+
+query R
+SELECT kurtosis_pop(null)
+----
+NULL
+
+statement ok
+CREATE TABLE t1(c1 int);
+
+query R
+SELECT kurtosis_pop(c1) FROM t1;
+----
+NULL
+
+statement ok
+INSERT INTO t1 VALUES (1), (10), (100), (10), (1);
+
+query R
+SELECT kurtosis_pop(c1) FROM t1;
+----
+0.194323231917
+
+statement ok
+DROP TABLE t1;