Fix comparison report when one column is all NAs (#343)

* Fix compariosn report when one column is all NAs With situations when one of the column in the comparison was all NA's then this would break the reporting. For some reason when the matching (boolean) of the actual and expected columns happened when there was a categorical value compared to a NA, the result was NA, rather than False, as it would happen for the other elements in the cols compared. This has now been addressed at the intersect rows level which doesn't seem to break the reporting anymore. * Fix column_equal to work with StringArrays with pd.NA values not returning booleans * Add test for fn column_equal to work with StringArrays with pd.NA Add test for fn column_equal to work with StringArrays containing pd.NA values not returning booleans when compared with other df's with rows of StringArrays * Fix linter error Printing out the report would've been useful for this test, but looks like it makes the linter fail the build. This has now been fixed. * Fix column_equal to work with StringArrays with pd.NA values Fix column_equal to work with StringArrays with pd.NA values not returning booleans, and update formatting to match the linter expectation * Add test for fn column_equal to work with StringArrays with pd.NA Add test for fn column_equal to work with StringArrays containing pd.NA values not returning booleans when compared with other df's with rows of StringArrays, and format test to match the linter.
capitalone · Oct 29, 2024 · 1013ca8 · 1013ca8
1 parent 998ee4f
commit 1013ca8
Show file tree

Hide file tree

Showing 2 changed files with 35 additions and 1 deletion.
diff --git a/datacompy/core.py b/datacompy/core.py
@@ -799,6 +799,7 @@ def columns_equal(
         A series of Boolean values.  True == the values match, False == the
         values don't match.
     """
+    default_value = "DATACOMPY_NULL"
     compare: pd.Series[bool]
 
     # short circuit if comparing mixed type columns. We don't want to support this moving forward.
@@ -842,7 +843,8 @@ def columns_equal(
                     compare = compare_string_and_date_columns(col_1, col_2)
                 else:
                     compare = pd.Series(
-                        (col_1 == col_2) | (col_1.isnull() & col_2.isnull())
+                        (col_1.fillna(default_value) == col_2.fillna(default_value))
+                        | (col_1.isnull() & col_2.isnull())
                     )
             except Exception:
                 # Blanket exception should just return all False

diff --git a/tests/test_core.py b/tests/test_core.py
@@ -958,6 +958,38 @@ def test_sample_mismatch():
     assert (output.name_df1 != output.name_df2).all()
 
 
+def test_sample_mismatch_with_nans():
+    """Checks that comparison of StringArrays with pd.NA values returns booleans
+
+    When comparing pd.NA with a string the result is pd.NA, this breaks the compare
+    report with the following error:
+    "E ValueError: a must be greater than 0 unless no samples are taken"
+
+    Dataframes with StringArray type rows come when using pd.Dataframes created from
+    parquet files using the pyarrow engine.
+    """
+    df1 = pd.DataFrame(
+        {
+            "acct_id": [10000001221, 10000001222, 10000001223],
+            "name": pd.array([pd.NA, pd.NA, pd.NA], dtype="string"),
+        }
+    )
+    df1.set_index("acct_id", inplace=True)
+
+    df2 = pd.DataFrame(
+        {
+            "acct_id": [10000001221, 10000001222, 10000001223],
+            "name": pd.array([pd.NA, "Tobias Funke", pd.NA], dtype="string"),
+        }
+    )
+
+    df2.set_index("acct_id", inplace=True)
+
+    report = datacompy.Compare(df1=df1, df2=df2, on_index=True).report()
+
+    assert "Tobias Funke" in report
+
+
 def test_all_mismatch_not_ignore_matching_cols_no_cols_matching():
     data1 = """acct_id,dollar_amt,name,float_fld,date_fld
     10000001234,123.45,George Maharis,14530.1555,2017-01-01