Fix: updated test_(int/float)_column_profile.py test_diff to cover json.dumps() functionality. Updated numerical_column_stats.py _perform_t_test to cast conservative and welch values to floats before assignment. (#446)

Co-authored-by: Michael <[email protected]>
capitalone · Mar 30, 2022 · e4548d5 · e4548d5
1 parent 31bc817
commit e4548d5
Show file tree

Hide file tree

Showing 3 changed files with 23 additions and 9 deletions.
diff --git a/dataprofiler/profilers/numerical_column_stats.py b/dataprofiler/profilers/numerical_column_stats.py
@@ -422,16 +422,16 @@ def _perform_t_test(mean1, var1, n1,
         welch_df = s_delta ** 2 / ((var1 / n1) ** 2 /
                                    (n1 - 1) + (var2 / n2) ** 2 / (n2 - 1))
         results['t-statistic'] = t
-        results['conservative']['df'] = conservative_df
-        results['welch']['df'] = welch_df
+        results['conservative']['df'] = float(conservative_df)
+        results['welch']['df'] = float(welch_df)
 
         conservative_t = scipy.stats.t(conservative_df)
         conservative_p_val = (1 - conservative_t.cdf(abs(t))) * 2
         welch_t = scipy.stats.t(welch_df)
         welch_p_val = (1 - welch_t.cdf(abs(t))) * 2
 
-        results['conservative']['p-value'] = conservative_p_val
-        results['welch']['p-value'] = welch_p_val
+        results['conservative']['p-value'] = float(conservative_p_val)
+        results['welch']['p-value'] = float(welch_p_val)
         return results
 
     def _update_variance(self, batch_mean, batch_var, batch_count):

diff --git a/dataprofiler/tests/profilers/test_float_column_profile.py b/dataprofiler/tests/profilers/test_float_column_profile.py
@@ -3,6 +3,7 @@
 from collections import defaultdict
 from unittest import mock
 import warnings
+import json
 
 import pandas as pd
 import numpy as np
@@ -294,7 +295,7 @@ def test_profiled_mode(self):
         profiler.update(df)
         np.testing.assert_array_almost_equal([1.9, 2.01], profiler.mode,
                                              decimal=2)
-        
+
         # all unique values
         df = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).apply(str)
         profiler = FloatColumn(df.name)
@@ -1569,6 +1570,12 @@ def test_diff(self):
             }
         }
         profile_diff = profiler1.diff(profiler2)
+        try:
+            json.dumps(profile_diff)
+        except TypeError as e:
+            self.fail(
+                'JSON Serializing issue with the profile diff. '
+                'Exception raised: {}'.format(str(e)))
         self.assertAlmostEqual(
             expected_diff.pop('median'), profile_diff.pop('median'), places=2)
         expected_diff_mode = expected_diff.pop('mode')

diff --git a/dataprofiler/tests/profilers/test_int_column_profile.py b/dataprofiler/tests/profilers/test_int_column_profile.py
@@ -2,6 +2,7 @@
 import unittest
 from unittest import mock
 from collections import defaultdict
+import json
 
 import pandas as pd
 import numpy as np
@@ -540,7 +541,7 @@ def test_profile(self):
                         'sum': 1.0, 'variance': 1.0, 'skewness': 1.0,
                         'kurtosis': 1.0, 'num_negatives': 1.0,
                         'num_zeros': 1.0})
-            
+
         )
         time_array = [float(i) for i in range(100, 0, -1)]
         with mock.patch('time.time', side_effect=lambda: time_array.pop()):
@@ -824,7 +825,7 @@ def test_profile_merge_no_bin_overlap(self):
                                     'Profiles have no overlapping bin methods '
                                     'and therefore cannot be added together.'):
             profiler1 + profiler2
-            
+
     def test_profile_merge_with_different_options(self):
         # Creating first profiler with default options
         options = IntOptions()
@@ -850,13 +851,13 @@ def test_profile_merge_with_different_options(self):
                                    "max is disabled because it is not enabled in"
                                    " both profiles."):
             profiler3 = profiler1 + profiler2
-        
+
         # Assert that these features are still merged
         profile = profiler3.profile
         self.assertIsNotNone(profiler3.histogram_selection)
         self.assertIsNotNone(profile['variance'])
         self.assertIsNotNone(profiler3.sum)
-        
+
         # Assert that these features are not calculated
         self.assertIsNone(profiler3.max)
         self.assertIsNone(profiler3.min)
@@ -1006,6 +1007,12 @@ def test_diff(self):
             }
         }
         profile_diff = profiler1.diff(profiler2)
+        try:
+            json.dumps(profile_diff)
+        except TypeError as e:
+            self.fail(
+                'JSON Serializing issue with the profile diff. '
+                'Exception raised: {}'.format(str(e)))
         self.assertAlmostEqual(
             expected_diff.pop('median'), profile_diff.pop('median'), places=2)
         expected_diff_mode = expected_diff.pop('mode')