Minor refactoring of KS test. (#41)

kklein · web-flow · commit da059c5f6d41 · 2022-07-28T17:58:21.000+02:00
diff --git a/src/datajudge/constraints/stats.py b/src/datajudge/constraints/stats.py
@@ -1,9 +1,8 @@
 import math
 import warnings
-from typing import Optional, Tuple, Union
+from typing import Optional, Tuple
 
 import sqlalchemy as sa
-from sqlalchemy.sql import Selectable
 
 from .. import db_access
 from ..db_access import DataReference
@@ -71,15 +70,20 @@ def c(alpha: float):
     @staticmethod
     def calculate_statistic(
         engine,
-        table1_def: Tuple[Union[Selectable, str], str],
-        table2_def: Tuple[Union[Selectable, str], str],
+        ref1: DataReference,
+        ref2: DataReference,
     ) -> Tuple[float, Optional[float], int, int]:
 
         # retrieve test statistic d, as well as sample sizes m and n
-        d_statistic, n_samples, m_samples = db_access.get_ks_2sample(
-            engine, table1=table1_def, table2=table2_def
+        d_statistic = db_access.get_ks_2sample(
+            engine,
+            ref1,
+            ref2,
         )
 
+        n_samples, _ = db_access.get_row_count(engine, ref1)
+        m_samples, _ = db_access.get_row_count(engine, ref2)
+
         # calculate approximate p-value
         p_value = KolmogorovSmirnov2Sample.approximate_p_value(
             d_statistic, n_samples, m_samples
@@ -90,13 +94,11 @@ def calculate_statistic(
     def test(self, engine: sa.engine.Engine) -> TestResult:
 
         # get query selections and column names for target columns
-        selection1 = self.ref.data_source.get_clause(engine)
-        column1 = self.ref.get_column(engine)
-        selection2 = self.ref2.data_source.get_clause(engine)
-        column2 = self.ref2.get_column(engine)
 
         d_statistic, p_value, n_samples, m_samples = self.calculate_statistic(
-            engine, (selection1, column1), (selection2, column2)
+            engine,
+            self.ref,
+            self.ref2,
         )
 
         # calculate test acceptance
diff --git a/src/datajudge/db_access.py b/src/datajudge/db_access.py
@@ -905,34 +905,28 @@ def get_column_array_agg(
 
 
 def get_ks_2sample(
-    engine: sa.engine.Engine, table1: tuple, table2: tuple
-) -> tuple[float, int, int]:
+    engine: sa.engine.Engine,
+    ref1: DataReference,
+    ref2: DataReference,
+) -> float:
     """
     Runs the query for the two-sample Kolmogorov-Smirnov test and returns the test statistic d.
     """
-
-    # make sure we have a string representation here
-    table1_selection, col1 = str(table1[0]), str(table1[1])
-    table2_selection, col2 = str(table2[0]), str(table2[1])
-
-    if is_mssql(engine):  # "tempdb.dbo".table_name -> tempdb.dbo.table_name
-        table1_selection = table1_selection.replace('"', "")
-        table2_selection = table2_selection.replace('"', "")
-
-    # for RawQueryDataSource this could be a whole subquery and will therefore need to be wrapped
-    if "SELECT" in table1_selection:
-        table1_selection = f"({table1_selection})"
-        table2_selection = f"({table2_selection})"
+    # Double quotes are stripped for every backend; mssql needs it: "tempdb.dbo".table_name -> tempdb.dbo.table_name
+    table1_str = str(ref1.data_source.get_clause(engine)).replace('"', "")
+    col1 = ref1.get_column(engine)
+    table2_str = str(ref2.data_source.get_clause(engine)).replace('"', "")
+    col2 = ref2.get_column(engine)
 
     # for a more extensive explanation, see:
     # https://github.com/Quantco/datajudge/pull/28#issuecomment-1165587929
     ks_query_string = f"""
         WITH
         tab1 AS ( -- Step 0: Prepare data source and value column
-            SELECT {col1} as val FROM {table1_selection}
+            SELECT {col1} as val FROM {table1_str}
         ),
         tab2 AS (
-            SELECT {col2} as val FROM {table2_selection}
+            SELECT {col2} as val FROM {table2_str}
         ),
         tab1_cdf AS ( -- Step 1: Calculate the CDF over the value column
             SELECT val, cume_dist() over (order by val) as cdf
@@ -980,14 +974,7 @@ def get_ks_2sample(
     """
 
     d_statistic = engine.execute(ks_query_string).scalar()
-    n_samples = engine.execute(
-        f"SELECT COUNT(*) FROM {table1_selection} as n_table"
-    ).scalar()
-    m_samples = engine.execute(
-        f"SELECT COUNT(*) FROM {table2_selection} as m_table"
-    ).scalar()
-
-    return d_statistic, n_samples, m_samples
+    return d_statistic
 
 
 def get_regex_violations(engine, ref, aggregated, regex, n_counterexamples):
diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py
@@ -665,7 +665,7 @@ def groupby_aggregation_table_incorrect(engine, metadata):
 @pytest.fixture(scope="module")
 def random_normal_table(engine, metadata):
     """
-    Table containing 10_000 randomly distributed values with mean = 0 and std.dev = 1.
+    Table with normally distributed values of varying means and sd 1.
     """
     table_name = "random_normal_table"
     columns = [
diff --git a/tests/integration/test_integration.py b/tests/integration/test_integration.py
@@ -1897,25 +1897,59 @@ def test_diff_average_between():
 @pytest.mark.parametrize(
     "data",
     [
-        (identity, "col_int", "col_int", None, 1.0),
-        (identity, "col_int", "col_int", Condition("col_int >= 3"), 1.0),
+        (identity, "col_int", "col_int", None, None, 1.0),
+        (
+            identity,
+            "col_int",
+            "col_int",
+            Condition("col_int >= 3"),
+            Condition("col_int >= 3"),
+            1.0,
+        ),
     ],
 )
 def test_ks_2sample_constraint_perfect_between(engine, int_table1, data):
     """
     Test Kolmogorov-Smirnov for the same column -> p-value should be perfect 1.0.
     """
-    (operation, col_1, col_2, condition, significance_level) = data
+    (operation, col_1, col_2, condition1, condition2, significance_level) = data
     req = requirements.BetweenRequirement.from_tables(*int_table1, *int_table1)
     req.add_ks_2sample_constraint(
         column1=col_1,
         column2=col_2,
-        condition1=condition,
-        condition2=condition,
+        condition1=condition1,
+        condition2=condition2,
         significance_level=significance_level,
     )
+    test_result = req[0].test(engine)
+    assert operation(test_result.outcome), test_result.failure_message
 
-    assert operation(req[0].test(engine).outcome)
+
+# TODO: Enable this test once the bug is fixed.
+@pytest.mark.skip(reason="This is a known bug and unintended behaviour.")
+@pytest.mark.parametrize(
+    "data",
+    [
+        (negation, "col_int", "col_int", None, Condition("col_int >= 10"), 1.0),
+    ],
+)
+def test_ks_2sample_constraint_perfect_between_different_condition(
+    engine, int_table1, data
+):
+    """
+    Test Kolmogorov-Smirnov for the same column under different conditions -> constraint should fail.
+    """
+    (operation, col_1, col_2, condition1, condition2, significance_level) = data
+    req = requirements.BetweenRequirement.from_tables(*int_table1, *int_table1)
+    req.add_ks_2sample_constraint(
+        column1=col_1,
+        column2=col_2,
+        condition1=condition1,
+        condition2=condition2,
+        significance_level=significance_level,
+    )
+    test_result = req[0].test(engine)
+    assert operation(test_result.outcome), test_result.failure_message
 
 
 @pytest.mark.parametrize(
@@ -1933,8 +1967,8 @@ def test_ks_2sample_constraint_wrong_between(
     req.add_ks_2sample_constraint(
         column1=col_1, column2=col_2, significance_level=min_p_value
     )
-
-    assert operation(req[0].test(engine).outcome)
+    test_result = req[0].test(engine)
+    assert operation(test_result.outcome), test_result.failure_message
 
 
 @pytest.mark.parametrize(
@@ -1964,7 +1998,7 @@ def test_ks_2sample_random(engine, random_normal_table, configuration):
         column1=col_1, column2=col_2, significance_level=min_p_value
     )
     test_result = req[0].test(engine)
-    assert operation(test_result.outcome)
+    assert operation(test_result.outcome), test_result.failure_message
 
 
 @pytest.mark.parametrize(
@@ -1983,20 +2017,12 @@ def test_ks_2sample_implementation(engine, random_normal_table, configuration):
     ref = DataReference(tds, columns=[col_1])
     ref2 = DataReference(tds, columns=[col_2])
 
-    # retrieve table selections from data references
-    selection1 = ref.data_source.get_clause(engine)
-    column1 = ref.get_column(engine)
-    selection2 = ref2.data_source.get_clause(engine)
-    column2 = ref2.get_column(engine)
-
     (
         d_statistic,
         p_value,
         n_samples,
         m_samples,
-    ) = KolmogorovSmirnov2Sample.calculate_statistic(
-        engine, (selection1, column1), (selection2, column2)
-    )
+    ) = KolmogorovSmirnov2Sample.calculate_statistic(engine, ref, ref2)
 
     assert (
         abs(d_statistic - expected_d) <= 1e-10