@@ -51,14 +51,18 @@ use arrow::compute::{cast, concat};
51
51
use arrow:: datatypes:: { DataType , Field , Schema , SchemaRef } ;
52
52
use datafusion_common:: config:: { CsvOptions , JsonOptions } ;
53
53
use datafusion_common:: {
54
- exec_err, not_impl_err, plan_err, Column , DFSchema , DataFusionError , ParamValues ,
55
- SchemaError , UnnestOptions ,
54
+ exec_err, not_impl_err, plan_datafusion_err , plan_err, Column , DFSchema ,
55
+ DataFusionError , ParamValues , ScalarValue , SchemaError , UnnestOptions ,
56
56
} ;
57
- use datafusion_expr:: dml:: InsertOp ;
58
- use datafusion_expr:: { case, is_null, lit, SortExpr } ;
59
57
use datafusion_expr:: {
60
- utils:: COUNT_STAR_EXPANSION , TableProviderFilterPushDown , UNNAMED_TABLE ,
58
+ case,
59
+ dml:: InsertOp ,
60
+ expr:: { Alias , ScalarFunction } ,
61
+ is_null, lit,
62
+ utils:: COUNT_STAR_EXPANSION ,
63
+ SortExpr , TableProviderFilterPushDown , UNNAMED_TABLE ,
61
64
} ;
65
+ use datafusion_functions:: core:: coalesce;
62
66
use datafusion_functions_aggregate:: expr_fn:: {
63
67
avg, count, max, median, min, stddev, sum,
64
68
} ;
@@ -1930,6 +1934,89 @@ impl DataFrame {
1930
1934
plan,
1931
1935
} )
1932
1936
}
1937
+
1938
+ /// Fill null values in specified columns with a given value
1939
+ /// If no columns are specified (empty vector), applies to all columns
1940
+ /// Only fills if the value can be cast to the column's type
1941
+ ///
1942
+ /// # Arguments
1943
+ /// * `value` - Value to fill nulls with
1944
+ /// * `columns` - List of column names to fill. If empty, fills all columns.
1945
+ ///
1946
+ /// # Example
1947
+ /// ```
1948
+ /// # use datafusion::prelude::*;
1949
+ /// # use datafusion::error::Result;
1950
+ /// # use datafusion_common::ScalarValue;
1951
+ /// # #[tokio::main]
1952
+ /// # async fn main() -> Result<()> {
1953
+ /// let ctx = SessionContext::new();
1954
+ /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?;
1955
+ /// // Fill nulls in only columns "a" and "c":
1956
+ /// let df = df.fill_null(ScalarValue::from(0), vec!["a".to_owned(), "c".to_owned()])?;
1957
+ /// // Fill nulls across all columns:
1958
+ /// let df = df.fill_null(ScalarValue::from(0), vec![])?;
1959
+ /// # Ok(())
1960
+ /// # }
1961
+ /// ```
1962
+ pub fn fill_null (
1963
+ & self ,
1964
+ value : ScalarValue ,
1965
+ columns : Vec < String > ,
1966
+ ) -> Result < DataFrame > {
1967
+ let cols = if columns. is_empty ( ) {
1968
+ self . logical_plan ( )
1969
+ . schema ( )
1970
+ . fields ( )
1971
+ . iter ( )
1972
+ . map ( |f| f. as_ref ( ) . clone ( ) )
1973
+ . collect ( )
1974
+ } else {
1975
+ self . find_columns ( & columns) ?
1976
+ } ;
1977
+
1978
+ // Create projections for each column
1979
+ let projections = self
1980
+ . logical_plan ( )
1981
+ . schema ( )
1982
+ . fields ( )
1983
+ . iter ( )
1984
+ . map ( |field| {
1985
+ if cols. contains ( field) {
1986
+ // Try to cast fill value to column type. If the cast fails, fallback to the original column.
1987
+ match value. clone ( ) . cast_to ( field. data_type ( ) ) {
1988
+ Ok ( fill_value) => Expr :: Alias ( Alias {
1989
+ expr : Box :: new ( Expr :: ScalarFunction ( ScalarFunction {
1990
+ func : coalesce ( ) ,
1991
+ args : vec ! [ col( field. name( ) ) , lit( fill_value) ] ,
1992
+ } ) ) ,
1993
+ relation : None ,
1994
+ name : field. name ( ) . to_string ( ) ,
1995
+ } ) ,
1996
+ Err ( _) => col ( field. name ( ) ) ,
1997
+ }
1998
+ } else {
1999
+ col ( field. name ( ) )
2000
+ }
2001
+ } )
2002
+ . collect :: < Vec < _ > > ( ) ;
2003
+
2004
+ self . clone ( ) . select ( projections)
2005
+ }
2006
+
2007
+ // Helper to find columns from names
2008
+ fn find_columns ( & self , names : & [ String ] ) -> Result < Vec < Field > > {
2009
+ let schema = self . logical_plan ( ) . schema ( ) ;
2010
+ names
2011
+ . iter ( )
2012
+ . map ( |name| {
2013
+ schema
2014
+ . field_with_name ( None , name)
2015
+ . cloned ( )
2016
+ . map_err ( |_| plan_datafusion_err ! ( "Column '{}' not found" , name) )
2017
+ } )
2018
+ . collect ( )
2019
+ }
1933
2020
}
1934
2021
1935
2022
#[ derive( Debug ) ]
0 commit comments