Skip to content

Commit

Permalink
test refactor for floating point failures + 1 mypy fix
Browse files: browse the repository at this point in the history
  • Loading branch information
armaan-dhillon committed Feb 5, 2025
1 parent 1639641 commit d3b91ad
Show file tree
Hide file tree
Showing 4 changed files with 54 additions and 5 deletions.
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ repos:
# Flake8: complexity and style checking
# https://flake8.pycqa.org/en/latest/user/using-hooks.html
- repo: https://github.com/pycqa/flake8
rev: 4.0.1
rev: 5.0.4
hooks:
- id: flake8
additional_dependencies: [flake8-docstrings]
Expand Down
2 changes: 1 addition & 1 deletion dataprofiler/data_readers/parquet_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ def __init__(
self._load_data(data)

@property
def file_encoding(self) -> None:
def file_encoding(self) -> Optional[str]:
"""Return None as the file encoding since it is not detected for parquet."""
return None

Expand Down
40 changes: 39 additions & 1 deletion dataprofiler/tests/profilers/test_categorical_column_profile.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import json
import math
import os
import unittest
from collections import defaultdict
Expand Down Expand Up @@ -731,7 +732,44 @@ def test_categorical_diff(self):
},
}
actual_diff = profile.diff(profile2)
self.assertDictEqual(expected_diff, actual_diff)

assert expected_diff["categorical"] == actual_diff["categorical"]
assert (
expected_diff["statistics"]["unique_count"]
== actual_diff["statistics"]["unique_count"]
)
assert math.isclose(
expected_diff["statistics"]["unique_ratio"],
actual_diff["statistics"]["unique_ratio"],
)
assert (
expected_diff["statistics"]["categories"]
== actual_diff["statistics"]["categories"]
)
assert math.isclose(
expected_diff["statistics"]["gini_impurity"],
actual_diff["statistics"]["gini_impurity"],
)
assert math.isclose(
expected_diff["statistics"]["unalikeability"],
actual_diff["statistics"]["unalikeability"],
)
assert (
expected_diff["statistics"]["categorical_count"]
== actual_diff["statistics"]["categorical_count"]
)
assert math.isclose(
expected_diff["statistics"]["chi2-test"]["chi2-statistic"],
actual_diff["statistics"]["chi2-test"]["chi2-statistic"],
)
assert (
expected_diff["statistics"]["chi2-test"]["deg_of_free"]
== actual_diff["statistics"]["chi2-test"]["deg_of_free"]
)
assert math.isclose(
expected_diff["statistics"]["chi2-test"]["p-value"],
actual_diff["statistics"]["chi2-test"]["p-value"],
)

# Test with one categorical column matching
df_not_categorical = pd.Series(
Expand Down
15 changes: 13 additions & 2 deletions dataprofiler/tests/profilers/test_profile_builder.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import json
import logging
import math
import os
import random
import re
Expand Down Expand Up @@ -2162,8 +2163,18 @@ def test_diff_categorical_chi2_test(self, *mocks):
"deg_of_free": 2,
"p-value": 0.3099238764710244,
}
self.assertDictEqual(
expected_chi2_test_dict, diff["data_stats"][0]["statistics"]["chi2-test"]
actual_chi2_test_dict = diff["data_stats"][0]["statistics"]["chi2-test"]

assert math.isclose(
expected_chi2_test_dict["chi2-statistic"],
actual_chi2_test_dict["chi2-statistic"],
)
assert (
expected_chi2_test_dict["deg_of_free"]
== actual_chi2_test_dict["deg_of_free"]
)
assert math.isclose(
expected_chi2_test_dict["p-value"], actual_chi2_test_dict["p-value"]
)

@mock.patch(
Expand Down

0 comments on commit d3b91ad

Please sign in to comment.