synthesized-io · marqueewinq · Feb 15, 2024 · Feb 15, 2024 · Feb 15, 2024 · Feb 15, 2024
diff --git a/src/insight/metrics/__init__.py b/src/insight/metrics/__init__.py
@@ -1,6 +1,8 @@
 from .base import OneColumnMetric, TwoColumnMetric, TwoDataFrameMetric
 from .metrics import (
     BhattacharyyaCoefficient,
+    ChiSquareContingencyPValue,
+    ChiSquareContingencyRawValue,
     CramersV,
     EarthMoversDistance,
     EarthMoversDistanceBinned,
@@ -19,6 +21,8 @@
 __all__ = [
     "BhattacharyyaCoefficient",
     "CorrMatrix",
+    "ChiSquareContingencyRawValue",
+    "ChiSquareContingencyPValue",
     "CramersV",
     "DiffCorrMatrix",
     "EarthMoversDistance",

diff --git a/src/insight/metrics/metrics.py b/src/insight/metrics/metrics.py
@@ -5,7 +5,7 @@
 import numpy as np
 import pandas as pd
 from scipy.spatial.distance import jensenshannon
-from scipy.stats import entropy, ks_2samp, wasserstein_distance
+from scipy.stats import chi2_contingency, entropy, ks_2samp, wasserstein_distance
 
 from ..check import Check, ColumnCheck
 from .base import OneColumnMetric, TwoColumnMetric
@@ -534,3 +534,55 @@ def _compute_metric(self, sr_a: pd.Series, sr_b: pd.Series) -> float:
             return 1.0
 
         return ks_2samp(sr_a.dropna(), sr_b.dropna())[0]  # The first element is the KS statistic
+
+
+class ChiSquareContingencyRawValue(TwoColumnMetric):
+    """Chi-square test of independence between two categorical columns.
+
+    Reports the raw chi-square statistic of the contingency table of the two columns.
+    """
+
+    name = "chi_square_contingency_raw"
+
+    @classmethod
+    def check_column_types(
+        cls, sr_a: pd.Series, sr_b: pd.Series, check: Check = ColumnCheck()
+    ) -> bool:
+        """Accept the pair only when both columns are categorical."""
+        return check.categorical(sr_a) and check.categorical(sr_b)
+
+    def _scalar(self, chi2: float, p: float, dof: int, expected: np.ndarray) -> float:
+        """Select the reported value from the chi2 test results: the raw statistic."""
+        return chi2
+
+    def _compute_metric(self, sr_a: pd.Series, sr_b: pd.Series) -> float:
+        """Run the chi-square contingency test on the cross-tabulation of two series.
+
+        Args:
+            sr_a (pd.Series): Categories of the first variable.
+            sr_b (pd.Series): Categories of the second variable to compare.
+
+        Returns:
+            The value ``_scalar`` selects from the test results. The test is skipped
+            for two fixed cases: 1.0 is returned when either series is empty and
+            0.0 when both series are equal.
+        """
+        if sr_a.empty or sr_b.empty:
+            return 1.0
+        if sr_a.equals(sr_b):
+            return 0.0
+
+        # Rows are indexed by the values of sr_a, columns by the values of sr_b.
+        contingency_table = pd.crosstab(sr_a, sr_b)
+        chi2, p, dof, expected = chi2_contingency(contingency_table)
+        return self._scalar(chi2, p, dof, expected)
+
+
+class ChiSquareContingencyPValue(ChiSquareContingencyRawValue):
+    """Same test as ``ChiSquareContingencyRawValue``, reporting the p-value instead."""
+
+    name = "chi_square_contingency_p_value"
+
+    def _scalar(self, chi2: float, p: float, dof: int, expected: np.ndarray) -> float:
+        """Select the reported value from the chi2 test results: the p-value."""
+        return p
diff --git a/tests/test_metrics/test_metrics.py b/tests/test_metrics/test_metrics.py
@@ -7,6 +7,8 @@
 from insight.check import ColumnCheck
 from insight.metrics import (
     BhattacharyyaCoefficient,
+    ChiSquareContingencyPValue,
+    ChiSquareContingencyRawValue,
     CramersV,
     EarthMoversDistance,
     EarthMoversDistanceBinned,
@@ -338,6 +340,72 @@ def categorical(self, sr: pd.Series) -> bool:
     assert 0 <= ksd(pd.Series([1, np.nan, 3]), pd.Series([1, 2, 3])) <= 1
 
 
+def test_chi_square_contingency():
+    # Fixtures: two categorical (string) series and two continuous (int64) series
+    cat_a = pd.Series(["a", "b", "c", "b"], dtype="string")
+    cat_b = pd.Series(["a", "b", "c", "a"], dtype="string")
+    cont_a = pd.Series([1, 2, 3, 3], dtype="int64")
+    cont_b = pd.Series([1, 1, 2, 3], dtype="int64")
+
+    class SimpleCheck(ColumnCheck):
+        def infer_dtype(self, sr: pd.Series) -> str:
+            return sr.dtype
+
+        def continuous(self, sr: pd.Series) -> bool:
+            return sr.dtype.kind in ("i", "f")
+
+        def categorical(self, sr: pd.Series) -> bool:
+            return sr.dtype == "string"
+
+    chi_square_contingency = ChiSquareContingencyPValue(check=SimpleCheck())
+    chi_square_contingency_raw = ChiSquareContingencyRawValue(check=SimpleCheck())
+
+    # Test with categorical data
+    assert (
+        0 <= chi_square_contingency._compute_metric(cat_a, cat_b) <= 1
+    ), "Chi-square should return a normalized value between 0 and 1"
+    assert (
+        chi_square_contingency_raw._compute_metric(cat_a, cat_b) is not None
+    ), "Chi-square should return not null on categoricals"
+
+    # Mixed categorical/continuous pairs: calling the metric itself is expected to return None
+    assert chi_square_contingency(cat_a, cont_b) is None
+    assert chi_square_contingency(cont_a, cat_b) is None
+
+    # Test with identical categorical distributions
+    assert (
+        chi_square_contingency._compute_metric(cat_a, cat_a) == 0
+    ), "Identical distributions should return a distance of 0"
+
+    # Test with completely different distributions (using a simple method to make them "completely different")
+    cat_c = pd.Series(["d", "e", "f"], dtype="string")
+    assert (
+        chi_square_contingency._compute_metric(cat_a, cat_c) > 0
+    ), "Completely different distributions should not have a 0 distance"
+
+    # Edge cases
+    # Test with one or both series empty
+    assert (
+        chi_square_contingency._compute_metric(pd.Series([], dtype="string"), cat_a) == 1
+    ), "Comparison with an empty series should return 1"
+    assert (
+        chi_square_contingency._compute_metric(cat_a, pd.Series([], dtype="string")) == 1
+    ), "Comparison with an empty series should return 1"
+    assert (
+        chi_square_contingency._compute_metric(
+            pd.Series([], dtype="string"), pd.Series([], dtype="string")
+        )
+        == 1
+    ), "Both series empty should return 1"
+
+    # Test with series containing unique categories
+    unique_a = pd.Series(["a", "b", "c"], dtype="string")
+    unique_b = pd.Series(["x", "y", "z"], dtype="string")
+    assert (
+        chi_square_contingency._compute_metric(unique_a, unique_b) > 0
+    ), "Unique non-overlapping categories should not have a 0 distance"
+
+
 def test_js_divergence(group1, group2, group3):
     assert js_divergence(pd.Series([1, 0]), pd.Series([1, 0])) == 0