Skip to content

Commit f3c98ec

Browse files
authored
Add fill_null method to DataFrame API for handling missing values (#1019)
* feat: add fill_null method to DataFrame for handling null values * test: add coalesce function tests for handling default values * Resolve test cases for fill_null * feat: add fill_nan method to DataFrame for handling NaN values * move imports out of functions * docs: add documentation for fill_null and fill_nan methods in DataFrame * Add more tests * fix ruff errors * amend def fill_null to invoke PyDataFrame's fill_null - Implemented `fill_null` method in `dataframe.rs` to allow filling null values with a specified value for specific columns or all columns. - Added a helper function `python_value_to_scalar_value` to convert Python values to DataFusion ScalarValues, supporting various types including integers, floats, booleans, strings, and timestamps. - Updated the `count` method in `PyDataFrame` to maintain functionality. * refactor: remove fill_nan method documentation from functions.rst * refactor: remove unused import of Enum from dataframe.py * refactor: improve error handling and type extraction in python_value_to_scalar_value function * refactor: enhance datetime and date conversion logic in python_value_to_scalar_value function * refactor: streamline type extraction in python_value_to_scalar_value function * fix try_convert_to_string * refactor: improve type handling in python_value_to_scalar_value function * refactor: move py_obj_to_scalar_value function to utils module * refactor: update fill_null to use py_obj_to_scalar_value from utils * Remove python_object_to_scalar_value code * refactor: enhance py_obj_to_scalar_value to utilize PyArrow for complex type conversion * refactor: update py_obj_to_scalar_value to handle errors and use extract_bound for PyArrow scalar conversion * refactor: modify py_obj_to_scalar_value to return ScalarValue directly and streamline error handling * refactor: update py_obj_to_scalar_value to return a Result for better error handling * test: add tests for fill_null functionality in DataFrame with null values * test: 
enhance null DataFrame tests to include date32 and date64 columns * refactor: simplify py_obj_to_scalar_value by removing direct extraction of basic types * refactor: remove unnecessary documentation from py_obj_to_scalar_value function * Fix ruff errors * test: update datetime handling in coalesce tests to include timezone information * Fix ruff errors * trigger ci
1 parent 7d8bcd8 commit f3c98ec

File tree

7 files changed

+414
-22
lines changed

7 files changed

+414
-22
lines changed

docs/source/user-guide/common-operations/functions.rst

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,3 +129,24 @@ The function :py:func:`~datafusion.functions.in_list` allows to check a column f
129129
.limit(20)
130130
.to_pandas()
131131
)
132+
133+
134+
Handling Missing Values
135+
=======================
136+
137+
DataFusion provides methods to handle missing values in DataFrames:
138+
139+
fill_null
140+
---------
141+
142+
The ``fill_null()`` method replaces NULL values in specified columns with a provided value:
143+
144+
.. code-block:: python
145+
146+
# Fill all NULL values with 0 where possible
147+
df = df.fill_null(0)
148+
149+
# Fill NULL values only in specific string columns
150+
df = df.fill_null("missing", subset=["name", "category"])
151+
152+
The fill value will be cast to match each column's type. If casting fails for a column, that column remains unchanged.

python/datafusion/dataframe.py

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,8 @@
3737
except ImportError:
3838
from typing_extensions import deprecated # Python 3.12
3939

40+
from datafusion._internal import DataFrame as DataFrameInternal
41+
from datafusion.expr import Expr, SortExpr, sort_or_default
4042
from datafusion.plan import ExecutionPlan, LogicalPlan
4143
from datafusion.record_batch import RecordBatchStream
4244

@@ -53,8 +55,6 @@
5355

5456
from enum import Enum
5557

56-
from datafusion.expr import Expr, SortExpr, sort_or_default
57-
5858

5959
# excerpt from deltalake
6060
# https://github.com/apache/datafusion-python/pull/981#discussion_r1905619163
@@ -869,3 +869,25 @@ def within_limit(df: DataFrame, limit: int) -> DataFrame:
869869
DataFrame: After applying func to the original dataframe.
870870
"""
871871
return func(self, *args)
872+
873+
def fill_null(self, value: Any, subset: list[str] | None = None) -> DataFrame:
    """Replace NULL values with ``value`` in some or all columns.

    Args:
        value: Replacement for nulls; it is cast to each target column's type.
        subset: Column names to fill. ``None`` means every column.

    Returns:
        A new ``DataFrame`` with nulls replaced wherever the cast succeeds.

    Examples:
        >>> df = df.fill_null(0)  # Fill all nulls with 0 where possible
        >>> # Fill nulls in specific string columns
        >>> df = df.fill_null("missing", subset=["name", "category"])

    Notes:
        - A column is only filled when ``value`` can be cast to its type
        - Columns where the cast fails are returned unchanged
        - Columns outside ``subset`` are returned unchanged
    """
    filled = self.df.fill_null(value, subset)
    return DataFrame(filled)

python/tests/test_dataframe.py

Lines changed: 266 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
# KIND, either express or implied. See the License for the
1515
# specific language governing permissions and limitations
1616
# under the License.
17+
import datetime
1718
import os
1819
import re
1920
from typing import Any
@@ -119,6 +120,38 @@ def clean_formatter_state():
119120
reset_formatter()
120121

121122

123+
@pytest.fixture
def null_df():
    """DataFrame fixture containing nulls across six column types."""
    columns = {
        "int_col": pa.array([1, None, 3, None], type=pa.int64()),
        "float_col": pa.array([4.5, 6.7, None, None], type=pa.float64()),
        "str_col": pa.array(["a", None, "c", None], type=pa.string()),
        "bool_col": pa.array([True, None, False, None], type=pa.bool_()),
        # Days since epoch: 2000-01-01, null, 2022-01-01, null
        "date32_col": pa.array([10957, None, 18993, None], type=pa.date32()),
        # Milliseconds since epoch: 2000-01-01, null, 2022-01-01, null
        "date64_col": pa.array(
            [946684800000, None, 1640995200000, None], type=pa.date64()
        ),
    }

    batch = pa.RecordBatch.from_arrays(
        list(columns.values()), names=list(columns.keys())
    )

    ctx = SessionContext()
    return ctx.create_dataframe([[batch]])
154+
122155
# custom style for testing with html formatter
123156
class CustomStyleProvider:
124157
def get_cell_style(self) -> str:
@@ -1794,3 +1827,236 @@ def test_html_formatter_manual_format_html(clean_formatter_state):
17941827

17951828
assert "<style>" in local_html_1
17961829
assert "<style>" in local_html_2
1830+
1831+
1832+
def test_fill_null_basic(null_df):
    """Filling every column's nulls with a single scalar value."""
    batch = null_df.fill_null(0).collect()[0]

    expected = [
        pa.array([1, 0, 3, 0]),                 # int: literal 0
        pa.array([4.5, 6.7, 0.0, 0.0]),         # float: cast to 0.0
        pa.array(["a", "0", "c", "0"]),         # string: cast to "0"
        pa.array([True, False, False, False]),  # bool: 0 becomes False
    ]
    for idx, want in enumerate(expected):
        assert batch.column(idx) == want
1846+
1847+
1848+
def test_fill_null_subset(null_df):
    """Nulls are replaced only in the columns named by ``subset``."""
    batch = null_df.fill_null(0, subset=["int_col", "float_col"]).collect()[0]

    # The targeted numeric columns are fully filled
    assert batch.column(0) == pa.array([1, 0, 3, 0])
    assert batch.column(1) == pa.array([4.5, 6.7, 0.0, 0.0])

    # Columns outside the subset keep their nulls
    assert None in batch.column(2).to_pylist()
    assert None in batch.column(3).to_pylist()
1861+
1862+
1863+
def test_fill_null_str_column(null_df):
    """String-column fills with a plain and with an empty replacement."""
    batch = null_df.fill_null("N/A", subset=["str_col"]).collect()[0]

    # Only the string column is rewritten
    assert batch.column(2).to_pylist() == ["a", "N/A", "c", "N/A"]

    # Every other column still contains nulls
    for idx in (0, 1, 3):
        assert None in batch.column(idx).to_pylist()

    # The empty string is a valid fill value too
    batch = null_df.fill_null("", subset=["str_col"]).collect()[0]
    assert batch.column(2).to_pylist() == ["a", "", "c", ""]
1882+
1883+
1884+
def test_fill_null_bool_column(null_df):
    """Boolean-column fills with True and then with False."""
    batch = null_df.fill_null(value=True, subset=["bool_col"]).collect()[0]
    assert batch.column(3).to_pylist() == [True, True, False, True]

    # A column outside the subset keeps its nulls
    assert None in batch.column(0).to_pylist()

    batch = null_df.fill_null(value=False, subset=["bool_col"]).collect()[0]
    assert batch.column(3).to_pylist() == [True, False, False, False]
1901+
1902+
1903+
def test_fill_null_date32_column(null_df):
    """date32 nulls are filled from a Python ``date`` value."""
    fill_value = datetime.date(1970, 1, 1)
    batch = null_df.fill_null(fill_value, subset=["date32_col"]).collect()[0]

    # Originals survive, nulls become the epoch date
    assert batch.column(4).to_pylist() == [
        datetime.date(2000, 1, 1),
        fill_value,
        datetime.date(2022, 1, 1),
        fill_value,
    ]

    # The date64 column was not in the subset, so it keeps its nulls
    assert None in batch.column(5).to_pylist()
1921+
1922+
1923+
def test_fill_null_date64_column(null_df):
    """date64 nulls are filled from a Python ``date`` value."""
    fill_value = datetime.date(1970, 1, 1)
    batch = null_df.fill_null(fill_value, subset=["date64_col"]).collect()[0]

    # Originals survive, nulls become the epoch date
    assert batch.column(5).to_pylist() == [
        datetime.date(2000, 1, 1),
        fill_value,
        datetime.date(2022, 1, 1),
        fill_value,
    ]

    # The date32 column was not in the subset, so it keeps its nulls
    assert None in batch.column(4).to_pylist()
1941+
1942+
1943+
def test_fill_null_type_coercion(null_df):
    """Fill values of mismatched types are cast to the column's type."""
    batch = null_df.fill_null(42, subset=["str_col"]).collect()[0]

    # An integer fill lands in a string column as its string form
    assert batch.column(2).to_pylist() == ["a", "42", "c", "42"]

    # Filling bools from the string "true" must at least succeed; the
    # exact parsed value is implementation-defined, so only verify
    # that no null survives.
    batch = null_df.fill_null("true", subset=["bool_col"]).collect()[0]
    assert None not in batch.column(3).to_pylist()
1960+
1961+
1962+
def test_fill_null_multiple_date_columns(null_df):
    """One date value can fill date32 and date64 columns together."""
    fill_date = datetime.date(2023, 12, 31)
    filled = null_df.fill_null(fill_date, subset=["date32_col", "date64_col"])
    batch = filled.collect()[0]

    # Both date columns are fully filled, with the nulls (rows 1 and 3)
    # replaced by the supplied date.
    for idx in (4, 5):
        values = batch.column(idx).to_pylist()
        assert None not in values
        assert values[1] == fill_date
        assert values[3] == fill_date
1982+
1983+
1984+
def test_fill_null_specific_types(null_df):
    """A string fill value only lands in columns it can be cast to."""
    batch = null_df.fill_null("missing").collect()[0]

    expected_dates = [
        datetime.date(2000, 1, 1),
        None,
        datetime.date(2022, 1, 1),
        None,
    ]

    # Non-string columns cannot cast "missing", so they keep their nulls
    assert batch.column(0).to_pylist() == [1, None, 3, None]
    assert batch.column(1).to_pylist() == [4.5, 6.7, None, None]
    assert batch.column(3).to_pylist() == [True, None, False, None]
    assert batch.column(4).to_pylist() == expected_dates
    assert batch.column(5).to_pylist() == expected_dates

    # Only the string column is actually filled
    assert batch.column(2).to_pylist() == ["a", "missing", "c", "missing"]
2009+
2010+
2011+
def test_fill_null_immutability(null_df):
    """fill_null returns a new DataFrame and leaves the source intact."""
    nulls_before = null_df.collect()[0].column(0).to_pylist().count(None)

    _filled = null_df.fill_null(0)

    nulls_after = null_df.collect()[0].column(0).to_pylist().count(None)
    assert nulls_before == nulls_after
    assert nulls_before > 0  # guard: the fixture must actually contain nulls
2026+
2027+
2028+
def test_fill_null_empty_df(ctx):
    """fill_null on a zero-row DataFrame preserves the schema."""
    schema_batch = pa.RecordBatch.from_arrays(
        [pa.array([], type=pa.int64()), pa.array([], type=pa.string())],
        names=["a", "b"],
    )
    empty_df = ctx.create_dataframe([[schema_batch]])

    # Filling an empty frame must not raise
    batch = empty_df.fill_null(0).collect()[0]

    # Still zero rows, schema intact
    assert len(batch.column(0)) == 0
    assert len(batch.column(1)) == 0
    assert batch.schema.field(0).name == "a"
    assert batch.schema.field(1).name == "b"
2046+
2047+
2048+
def test_fill_null_all_null_column(ctx):
    """A column that is entirely null gets every slot filled."""
    source = pa.RecordBatch.from_arrays(
        [pa.array([1, 2, 3]), pa.array([None, None, None], type=pa.string())],
        names=["a", "b"],
    )
    df = ctx.create_dataframe([[source]])

    batch = df.fill_null("filled").collect()[0]
    assert batch.column(1).to_pylist() == ["filled", "filled", "filled"]

0 commit comments

Comments
 (0)