Skip to content

Commit

Permalink
Fix: updated test_(int/float)_column_profile.py test_diff to cover json.dumps() functionality.
Browse files Browse the repository at this point in the history
Updated numerical_column_stats.py _perform_t_test to cast conservative and welch values to floats before assignment. (#446)

Co-authored-by: Michael <[email protected]>
  • Loading branch information
micdavis and Michael authored Mar 30, 2022
1 parent 31bc817 commit e4548d5
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 9 deletions.
8 changes: 4 additions & 4 deletions dataprofiler/profilers/numerical_column_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -422,16 +422,16 @@ def _perform_t_test(mean1, var1, n1,
welch_df = s_delta ** 2 / ((var1 / n1) ** 2 /
(n1 - 1) + (var2 / n2) ** 2 / (n2 - 1))
results['t-statistic'] = t
results['conservative']['df'] = conservative_df
results['welch']['df'] = welch_df
results['conservative']['df'] = float(conservative_df)
results['welch']['df'] = float(welch_df)

conservative_t = scipy.stats.t(conservative_df)
conservative_p_val = (1 - conservative_t.cdf(abs(t))) * 2
welch_t = scipy.stats.t(welch_df)
welch_p_val = (1 - welch_t.cdf(abs(t))) * 2

results['conservative']['p-value'] = conservative_p_val
results['welch']['p-value'] = welch_p_val
results['conservative']['p-value'] = float(conservative_p_val)
results['welch']['p-value'] = float(welch_p_val)
return results

def _update_variance(self, batch_mean, batch_var, batch_count):
Expand Down
9 changes: 8 additions & 1 deletion dataprofiler/tests/profilers/test_float_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from collections import defaultdict
from unittest import mock
import warnings
import json

import pandas as pd
import numpy as np
Expand Down Expand Up @@ -294,7 +295,7 @@ def test_profiled_mode(self):
profiler.update(df)
np.testing.assert_array_almost_equal([1.9, 2.01], profiler.mode,
decimal=2)

# all unique values
df = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).apply(str)
profiler = FloatColumn(df.name)
Expand Down Expand Up @@ -1569,6 +1570,12 @@ def test_diff(self):
}
}
profile_diff = profiler1.diff(profiler2)
try:
json.dumps(profile_diff)
except TypeError as e:
self.fail(
'JSON Serializing issue with the profile diff. '
'Exception raised: {}'.format(str(e)))
self.assertAlmostEqual(
expected_diff.pop('median'), profile_diff.pop('median'), places=2)
expected_diff_mode = expected_diff.pop('mode')
Expand Down
15 changes: 11 additions & 4 deletions dataprofiler/tests/profilers/test_int_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import unittest
from unittest import mock
from collections import defaultdict
import json

import pandas as pd
import numpy as np
Expand Down Expand Up @@ -540,7 +541,7 @@ def test_profile(self):
'sum': 1.0, 'variance': 1.0, 'skewness': 1.0,
'kurtosis': 1.0, 'num_negatives': 1.0,
'num_zeros': 1.0})

)
time_array = [float(i) for i in range(100, 0, -1)]
with mock.patch('time.time', side_effect=lambda: time_array.pop()):
Expand Down Expand Up @@ -824,7 +825,7 @@ def test_profile_merge_no_bin_overlap(self):
'Profiles have no overlapping bin methods '
'and therefore cannot be added together.'):
profiler1 + profiler2

def test_profile_merge_with_different_options(self):
# Creating first profiler with default options
options = IntOptions()
Expand All @@ -850,13 +851,13 @@ def test_profile_merge_with_different_options(self):
"max is disabled because it is not enabled in"
" both profiles."):
profiler3 = profiler1 + profiler2

# Assert that these features are still merged
profile = profiler3.profile
self.assertIsNotNone(profiler3.histogram_selection)
self.assertIsNotNone(profile['variance'])
self.assertIsNotNone(profiler3.sum)

# Assert that these features are not calculated
self.assertIsNone(profiler3.max)
self.assertIsNone(profiler3.min)
Expand Down Expand Up @@ -1006,6 +1007,12 @@ def test_diff(self):
}
}
profile_diff = profiler1.diff(profiler2)
try:
json.dumps(profile_diff)
except TypeError as e:
self.fail(
'JSON Serializing issue with the profile diff. '
'Exception raised: {}'.format(str(e)))
self.assertAlmostEqual(
expected_diff.pop('median'), profile_diff.pop('median'), places=2)
expected_diff_mode = expected_diff.pop('mode')
Expand Down

0 comments on commit e4548d5

Please sign in to comment.