From 1013ca8a82075b7619bc91af5c70dd8b5370d8a8 Mon Sep 17 00:00:00 2001 From: John Fieldman <1295319+yozzo@users.noreply.github.com> Date: Tue, 29 Oct 2024 13:40:44 +0000 Subject: [PATCH] Fix comparison report when one column is all NAs (#343) * Fix compariosn report when one column is all NAs With situations when one of the column in the comparison was all NA's then this would break the reporting. For some reason when the matching (boolean) of the actual and expected columns happened when there was a categorical value compared to a NA, the result was NA, rather than False, as it would happen for the other elements in the cols compared. This has now been addressed at the intersect rows level which doesn't seem to break the reporting anymore. * Fix column_equal to work with StringArrays with pd.NA values not returning booleans * Add test for fn column_equal to work with StringArrays with pd.NA Add test for fn column_equal to work with StringArrays containing pd.NA values not returning booleans when compared with other df's with rows of StringArrays * Fix linter error Printing out the report would've been useful for this test, but looks like it makes the linter fail the build. This has now been fixed. * Fix column_equal to work with StringArrays with pd.NA values Fix column_equal to work with StringArrays with pd.NA values not returning booleans, and update formatting to match the linter expectation * Add test for fn column_equal to work with StringArrays with pd.NA Add test for fn column_equal to work with StringArrays containing pd.NA values not returning booleans when compared with other df's with rows of StringArrays, and format test to match the linter. --- datacompy/core.py | 4 +++- tests/test_core.py | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/datacompy/core.py b/datacompy/core.py index 889fb901..7d0da3d8 100644 --- a/datacompy/core.py +++ b/datacompy/core.py @@ -799,6 +799,7 @@ def columns_equal( A series of Boolean values. True == the values match, False == the values don't match. """ + default_value = "DATACOMPY_NULL" compare: pd.Series[bool] # short circuit if comparing mixed type columns. We don't want to support this moving forward. @@ -842,7 +843,8 @@ def columns_equal( compare = compare_string_and_date_columns(col_1, col_2) else: compare = pd.Series( - (col_1 == col_2) | (col_1.isnull() & col_2.isnull()) + (col_1.fillna(default_value) == col_2.fillna(default_value)) + | (col_1.isnull() & col_2.isnull()) ) except Exception: # Blanket exception should just return all False diff --git a/tests/test_core.py b/tests/test_core.py index 482a12f4..b0c3647f 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -958,6 +958,38 @@ def test_sample_mismatch(): assert (output.name_df1 != output.name_df2).all() +def test_sample_mismatch_with_nans(): + """Checks that comparison of StringArrays with pd.NA values returns booleans + + When comparing pd.NA with a string the result is pd.NA, this breaks the compare + report with the following error: + "E ValueError: a must be greater than 0 unless no samples are taken" + + Dataframes with StringArray type rows come when using pd.Dataframes created from + parquet files using the pyarrow engine. + """ + df1 = pd.DataFrame( + { + "acct_id": [10000001221, 10000001222, 10000001223], + "name": pd.array([pd.NA, pd.NA, pd.NA], dtype="string"), + } + ) + df1.set_index("acct_id", inplace=True) + + df2 = pd.DataFrame( + { + "acct_id": [10000001221, 10000001222, 10000001223], + "name": pd.array([pd.NA, "Tobias Funke", pd.NA], dtype="string"), + } + ) + + df2.set_index("acct_id", inplace=True) + + report = datacompy.Compare(df1=df1, df2=df2, on_index=True).report() + + assert "Tobias Funke" in report + + def test_all_mismatch_not_ignore_matching_cols_no_cols_matching(): data1 = """acct_id,dollar_amt,name,float_fld,date_fld 10000001234,123.45,George Maharis,14530.1555,2017-01-01