diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a2047e1b..092cc5a4 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -19,7 +19,7 @@ repos: # Flake8: complexity and style checking # https://flake8.pycqa.org/en/latest/user/using-hooks.html - repo: https://github.com/pycqa/flake8 - rev: 4.0.1 + rev: 5.0.4 hooks: - id: flake8 additional_dependencies: [flake8-docstrings] diff --git a/dataprofiler/data_readers/parquet_data.py b/dataprofiler/data_readers/parquet_data.py index ee625316..0af1b563 100644 --- a/dataprofiler/data_readers/parquet_data.py +++ b/dataprofiler/data_readers/parquet_data.py @@ -68,7 +68,7 @@ def __init__( self._load_data(data) @property - def file_encoding(self) -> None: + def file_encoding(self) -> Optional[str]: """Set file encoding to None since not detected for avro.""" return None diff --git a/dataprofiler/tests/profilers/test_categorical_column_profile.py b/dataprofiler/tests/profilers/test_categorical_column_profile.py index 55d2ea68..16a223f4 100644 --- a/dataprofiler/tests/profilers/test_categorical_column_profile.py +++ b/dataprofiler/tests/profilers/test_categorical_column_profile.py @@ -1,4 +1,5 @@ import json +import math import os import unittest from collections import defaultdict @@ -731,7 +732,44 @@ def test_categorical_diff(self): }, } actual_diff = profile.diff(profile2) - self.assertDictEqual(expected_diff, actual_diff) + + assert expected_diff["categorical"] == actual_diff["categorical"] + assert ( + expected_diff["statistics"]["unique_count"] + == actual_diff["statistics"]["unique_count"] + ) + assert math.isclose( + expected_diff["statistics"]["unique_ratio"], + actual_diff["statistics"]["unique_ratio"], + ) + assert ( + expected_diff["statistics"]["categories"] + == actual_diff["statistics"]["categories"] + ) + assert math.isclose( + expected_diff["statistics"]["gini_impurity"], + actual_diff["statistics"]["gini_impurity"], + ) + assert math.isclose( + expected_diff["statistics"]["unalikeability"], + actual_diff["statistics"]["unalikeability"], + ) + assert ( + expected_diff["statistics"]["categorical_count"] + == actual_diff["statistics"]["categorical_count"] + ) + assert math.isclose( + expected_diff["statistics"]["chi2-test"]["chi2-statistic"], + actual_diff["statistics"]["chi2-test"]["chi2-statistic"], + ) + assert ( + expected_diff["statistics"]["chi2-test"]["deg_of_free"] + == actual_diff["statistics"]["chi2-test"]["deg_of_free"] + ) + assert math.isclose( + expected_diff["statistics"]["chi2-test"]["p-value"], + actual_diff["statistics"]["chi2-test"]["p-value"], + ) # Test with one categorical column matching df_not_categorical = pd.Series( diff --git a/dataprofiler/tests/profilers/test_profile_builder.py b/dataprofiler/tests/profilers/test_profile_builder.py index f9bbf14a..02521194 100644 --- a/dataprofiler/tests/profilers/test_profile_builder.py +++ b/dataprofiler/tests/profilers/test_profile_builder.py @@ -1,5 +1,6 @@ import json import logging +import math import os import random import re @@ -2162,8 +2163,18 @@ def test_diff_categorical_chi2_test(self, *mocks): "deg_of_free": 2, "p-value": 0.3099238764710244, } - self.assertDictEqual( - expected_chi2_test_dict, diff["data_stats"][0]["statistics"]["chi2-test"] + actual_chi2_test_dict = diff["data_stats"][0]["statistics"]["chi2-test"] + + assert math.isclose( + expected_chi2_test_dict["chi2-statistic"], + actual_chi2_test_dict["chi2-statistic"], + ) + assert ( + expected_chi2_test_dict["deg_of_free"] + == actual_chi2_test_dict["deg_of_free"] + ) + assert math.isclose( + expected_chi2_test_dict["p-value"], actual_chi2_test_dict["p-value"] ) @mock.patch(