|
14 | 14 | # KIND, either express or implied. See the License for the
|
15 | 15 | # specific language governing permissions and limitations
|
16 | 16 | # under the License.
|
| 17 | +import datetime |
17 | 18 | import os
|
18 | 19 | import re
|
19 | 20 | from typing import Any
|
@@ -119,6 +120,38 @@ def clean_formatter_state():
|
119 | 120 | reset_formatter()
|
120 | 121 |
|
121 | 122 |
|
@pytest.fixture
def null_df():
    """Return a four-row DataFrame whose columns all contain nulls.

    One column is built per Arrow type: int64, float64, string, bool,
    date32 and date64. Nulls sit at row indices 1 and 3 (0-based) in every
    column except ``float_col``, where they sit at indices 2 and 3.
    """
    ctx = SessionContext()

    # Both date columns encode 2000-01-01 and 2022-01-01: date32 as days and
    # date64 as milliseconds since the Unix epoch.
    columns = {
        "int_col": pa.array([1, None, 3, None], type=pa.int64()),
        "float_col": pa.array([4.5, 6.7, None, None], type=pa.float64()),
        "str_col": pa.array(["a", None, "c", None], type=pa.string()),
        "bool_col": pa.array([True, None, False, None], type=pa.bool_()),
        "date32_col": pa.array([10957, None, 18993, None], type=pa.date32()),
        "date64_col": pa.array(
            [946684800000, None, 1640995200000, None], type=pa.date64()
        ),
    }

    # Dict insertion order fixes the column order the tests index into.
    batch = pa.RecordBatch.from_arrays(
        list(columns.values()),
        names=list(columns),
    )

    return ctx.create_dataframe([[batch]])
| 153 | + |
| 154 | + |
122 | 155 | # custom style for testing with html formatter
|
123 | 156 | class CustomStyleProvider:
|
124 | 157 | def get_cell_style(self) -> str:
|
@@ -1794,3 +1827,236 @@ def test_html_formatter_manual_format_html(clean_formatter_state):
|
1794 | 1827 |
|
1795 | 1828 | assert "<style>" in local_html_1
|
1796 | 1829 | assert "<style>" in local_html_2
|
| 1830 | + |
| 1831 | + |
def test_fill_null_basic(null_df):
    """Fill every null in the frame with the scalar 0 and check columns 0-3."""
    batch = null_df.fill_null(0).collect()[0]

    # The single fill value 0 is expected to appear as 0, 0.0, "0" and False
    # in the int, float, string and bool columns respectively.
    expected_by_position = [
        pa.array([1, 0, 3, 0]),
        pa.array([4.5, 6.7, 0.0, 0.0]),
        pa.array(["a", "0", "c", "0"]),
        pa.array([True, False, False, False]),
    ]

    for position, expected in enumerate(expected_by_position):
        assert batch.column(position) == expected
| 1846 | + |
| 1847 | + |
def test_fill_null_subset(null_df):
    """Restrict fill_null to the two numeric columns through ``subset``."""
    numeric_only = null_df.fill_null(0, subset=["int_col", "float_col"])
    batch = numeric_only.collect()[0]

    int_col, float_col, str_col, bool_col = (batch.column(i) for i in range(4))

    # Columns named in ``subset`` have their nulls replaced by 0.
    assert int_col == pa.array([1, 0, 3, 0])
    assert float_col == pa.array([4.5, 6.7, 0.0, 0.0])

    # Columns left out of ``subset`` keep their nulls.
    assert None in str_col.to_pylist()
    assert None in bool_col.to_pylist()
| 1861 | + |
| 1862 | + |
def test_fill_null_str_column(null_df):
    """Fill only the string column, first with "N/A" and then with ""."""
    batch = null_df.fill_null("N/A", subset=["str_col"]).collect()[0]

    # Nulls in the string column become the replacement text.
    assert batch.column(2).to_pylist() == ["a", "N/A", "c", "N/A"]

    # The int, float and bool columns were not in ``subset`` and keep nulls.
    for untouched in (0, 1, 3):
        assert None in batch.column(untouched).to_pylist()

    # An empty string is accepted as a fill value as well.
    blank_batch = null_df.fill_null("", subset=["str_col"]).collect()[0]
    assert blank_batch.column(2).to_pylist() == ["a", "", "c", ""]
| 1882 | + |
| 1883 | + |
def test_fill_null_bool_column(null_df):
    """Fill only the boolean column, once with True and once with False."""
    true_batch = null_df.fill_null(value=True, subset=["bool_col"]).collect()[0]

    # Null slots take the fill value; existing True/False entries are kept.
    assert true_batch.column(3).to_pylist() == [True, True, False, True]

    # The int column was not in ``subset`` and still holds nulls.
    assert None in true_batch.column(0).to_pylist()

    false_batch = null_df.fill_null(value=False, subset=["bool_col"]).collect()[0]
    assert false_batch.column(3).to_pylist() == [True, False, False, False]
| 1901 | + |
| 1902 | + |
def test_fill_null_date32_column(null_df):
    """Fill nulls in the date32 column with 1970-01-01."""
    epoch = datetime.date(1970, 1, 1)
    batch = null_df.fill_null(epoch, subset=["date32_col"]).collect()[0]

    # Original dates stay at indices 0 and 2; the fill lands at 1 and 3.
    expected_dates = [
        datetime.date(2000, 1, 1),
        epoch,
        datetime.date(2022, 1, 1),
        epoch,
    ]
    actual_dates = batch.column(4).to_pylist()
    for position, expected in enumerate(expected_dates):
        assert actual_dates[position] == expected

    # The date64 column was not in ``subset`` and keeps its nulls.
    assert None in batch.column(5).to_pylist()
| 1921 | + |
| 1922 | + |
def test_fill_null_date64_column(null_df):
    """Fill nulls in the date64 column with 1970-01-01."""
    epoch = datetime.date(1970, 1, 1)
    batch = null_df.fill_null(epoch, subset=["date64_col"]).collect()[0]

    # Original dates stay at indices 0 and 2; the fill lands at 1 and 3.
    expected_dates = [
        datetime.date(2000, 1, 1),
        epoch,
        datetime.date(2022, 1, 1),
        epoch,
    ]
    actual_dates = batch.column(5).to_pylist()
    for position, expected in enumerate(expected_dates):
        assert actual_dates[position] == expected

    # The date32 column was not in ``subset`` and keeps its nulls.
    assert None in batch.column(4).to_pylist()
| 1941 | + |
| 1942 | + |
def test_fill_null_type_coercion(null_df):
    """Fill columns with values whose Python type differs from the column type."""
    # An integer fill value on the string column shows up as its text form.
    str_batch = null_df.fill_null(42, subset=["str_col"]).collect()[0]
    assert str_batch.column(2).to_pylist() == ["a", "42", "c", "42"]

    # A string fill value on the bool column: only require that no null
    # survives; the exact converted value is deliberately not asserted.
    bool_batch = null_df.fill_null("true", subset=["bool_col"]).collect()[0]
    assert None not in bool_batch.column(3).to_pylist()
| 1960 | + |
| 1961 | + |
def test_fill_null_multiple_date_columns(null_df):
    """Fill the date32 and date64 columns with one date in a single call."""
    fill_date = datetime.date(2023, 12, 31)
    batch = null_df.fill_null(
        fill_date, subset=["date32_col", "date64_col"]
    ).collect()[0]

    date32_values = batch.column(4).to_pylist()
    date64_values = batch.column(5).to_pylist()

    # Neither date column may contain a null afterwards.
    assert None not in date32_values
    assert None not in date64_values

    # Indices 1 and 3 were null in the fixture, so they now hold the fill date.
    for values in (date32_values, date64_values):
        assert values[1] == fill_date
        assert values[3] == fill_date
| 1982 | + |
| 1983 | + |
def test_fill_null_specific_types(null_df):
    """Fill the whole frame with a string and check which columns change.

    The fill value "missing" is applied without a ``subset``. The assertions
    below pin that only the string column is filled, while the int, float,
    bool and both date columns keep their nulls.
    """
    filled_df = null_df.fill_null("missing")

    result = filled_df.collect()[0]

    # Numeric columns: nulls are left in place.
    assert result.column(0).to_pylist() == [1, None, 3, None]
    assert result.column(1).to_pylist() == [4.5, 6.7, None, None]

    # String column: the only one whose nulls are replaced.
    assert result.column(2).to_pylist() == ["a", "missing", "c", "missing"]

    # Bool column: nulls are left in place (it is not filled with False).
    assert result.column(3).to_pylist() == [True, None, False, None]

    # Date columns (date32 then date64): nulls are left in place.
    expected_dates = [
        datetime.date(2000, 1, 1),
        None,
        datetime.date(2022, 1, 1),
        None,
    ]
    assert result.column(4).to_pylist() == expected_dates
    assert result.column(5).to_pylist() == expected_dates
| 2009 | + |
| 2010 | + |
def test_fill_null_immutability(null_df):
    """Check that fill_null returns a new frame and leaves the source intact.

    The filled frame is collected so the fill is actually evaluated; a frame
    that is built but never collected would not exercise the operation.
    """
    values_before = null_df.collect()[0].column(0).to_pylist()
    # Guard: the check is meaningless unless the source really has nulls.
    assert values_before.count(None) > 0

    # Evaluate the fill and confirm it removed the nulls in its own result.
    filled = null_df.fill_null(0).collect()[0]
    assert None not in filled.column(0).to_pylist()

    # Re-collecting the source frame must give the same values, nulls included.
    values_after = null_df.collect()[0].column(0).to_pylist()
    assert values_after == values_before
    assert values_after.count(None) == values_before.count(None)
| 2026 | + |
| 2027 | + |
def test_fill_null_empty_df(ctx):
    """Run fill_null on a zero-row frame with columns ``a`` and ``b``."""
    empty_arrays = [
        pa.array([], type=pa.int64()),
        pa.array([], type=pa.string()),
    ]
    empty_batch = pa.RecordBatch.from_arrays(empty_arrays, names=["a", "b"])
    empty_df = ctx.create_dataframe([[empty_batch]])

    # The call itself must not raise on empty input.
    batch = empty_df.fill_null(0).collect()[0]

    # The result still has zero rows and the same column names, in order.
    for position, column_name in enumerate(["a", "b"]):
        assert len(batch.column(position)) == 0
    for position, column_name in enumerate(["a", "b"]):
        assert batch.schema.field(position).name == column_name
| 2046 | + |
| 2047 | + |
def test_fill_null_all_null_column(ctx):
    """Run fill_null on a frame whose string column holds nothing but nulls."""
    int_values = pa.array([1, 2, 3])
    all_null_strings = pa.array([None, None, None], type=pa.string())
    batch = pa.RecordBatch.from_arrays(
        [int_values, all_null_strings],
        names=["a", "b"],
    )
    all_null_df = ctx.create_dataframe([[batch]])

    filled_batch = all_null_df.fill_null("filled").collect()[0]

    # Every slot of the formerly all-null column now holds the fill value.
    assert filled_batch.column(1).to_pylist() == ["filled", "filled", "filled"]
0 commit comments