logic phenotype tests now running except for inverse which is not implemented yet
Bayer-Group · Nov 14, 2024 · 6c2b507 · 6c2b507
1 parent 287cf49
commit 6c2b507
Show file tree

Hide file tree

Showing 4 changed files with 124 additions and 36 deletions.
diff --git a/phenex/phenotypes/computation_graph_phenotypes.py b/phenex/phenotypes/computation_graph_phenotypes.py
@@ -36,13 +36,15 @@ def __init__(
         expression: ComputationGraph,
         return_date: Union[str, Phenotype],
         name: str = None,
+        aggregation_index=["PERSON_ID"],
         _operate_on: str = "boolean",
         _populate: str = "value",
         _reduce: bool = False,
     ):
         super(ComputationGraphPhenotype, self).__init__()
         self.computation_graph = expression
         self.return_date = return_date
+        self.aggregation_index = aggregation_index
         self._name = name
         self._operate_on = _operate_on
         self._populate = _populate
@@ -74,16 +76,32 @@ def _execute(self, tables: Dict[str, Table]) -> PhenotypeTable:
             _expression = self.computation_graph.get_value_expression(
                 joined_table, operate_on=self._operate_on
             )
-            joined_table = joined_table.mutate(VALUE=_expression).mutate(
-                EVENT_DATE=ibis.null(date)
-            )
+            joined_table = joined_table.mutate(VALUE=_expression)
         elif self._populate == "boolean":
             _expression = self.computation_graph.get_boolean_expression(
                 joined_table, operate_on=self._operate_on
             )
-            joined_table = joined_table.mutate(BOOLEAN=_expression).mutate(
-                EVENT_DATE=ibis.null(date)
+            joined_table = joined_table.mutate(BOOLEAN=_expression)
+
+        # Return the first or last event date
+        ibis.options.interactive = True
+        date_columns = self._coalesce_all_date_columns(joined_table)
+        if self.return_date == "first":
+            joined_table = joined_table.mutate(
+                EVENT_DATE=ibis.least(*date_columns)
+            )
+        elif self.return_date == "last":
+            joined_table = joined_table.mutate(
+                EVENT_DATE=ibis.greatest(*date_columns)
             )
+        elif self.return_date == 'all':
+            joined_table = self._return_all_dates(joined_table, date_columns)
+        elif isinstance(self.return_date, Phenotype):
+            joined_table = joined_table.mutate(
+                EVENT_DATE=getattr(joined_table,f"{self.return_date.name}_EVENT_DATE")
+            )
+        else:
+            joined_table = joined_table.mutate(EVENT_DATE=ibis.null(date))
 
         # Reduce the table to only include rows where the boolean column is True
         if self._reduce:
@@ -96,6 +114,54 @@ def _execute(self, tables: Dict[str, Table]) -> PhenotypeTable:
 
         return joined_table
 
+    def _return_all_dates(self, table, date_columns):
+        """
+        Build one row per non-null date: for each expression in date_columns, keep the rows of table where it is not null and set EVENT_DATE to it, then union the per-column results.
+
+        Args:
+            table: The Ibis table object (e.g., joined_table) that the date expressions are evaluated against.
+            date_columns: Non-empty list of ibis date expressions (element 0 is indexed unconditionally below, so an empty list raises IndexError).
+
+        Returns:
+            Ibis table expression: the union of the filtered tables, each carrying its date in EVENT_DATE.
+        """
+        # one filtered copy of the table per date expression, with that expression written to EVENT_DATE
+        non_null_dates_by_date_col = []
+        for date_col in date_columns:
+            non_null_dates = (
+                table.filter(date_col.notnull())
+                    .mutate(EVENT_DATE=date_col)
+            )
+            non_null_dates_by_date_col.append(non_null_dates)
+
+        # fold the per-expression tables together with successive pairwise unions
+        all_dates = non_null_dates_by_date_col[0]
+        for non_null_dates in non_null_dates_by_date_col[1:]:
+            all_dates = all_dates.union(non_null_dates)
+        return all_dates
+
+    def _coalesce_all_date_columns(self, table):
+        """
+        Build one ibis COALESCE expression per EVENT_DATE column, for use as the arguments to ibis.least / ibis.greatest.
+
+        Args:
+            table: The Ibis table object (e.g., joined_table); every column whose name contains "EVENT_DATE" is used.
+            (The column names are discovered from table.columns; they are not passed in by the caller.)
+
+        Returns:
+            List of ibis expressions, one per matching column: the COALESCE of all matching columns, rotated to start at that column.
+        """
+        coalesce_expressions = []
+
+        names = [col for col in table.columns if "EVENT_DATE" in col]
+        # Rotation i starts at names[i], so expression i yields column i's date when it is non-null, otherwise the next non-null date in the cycle.
+        for i in range(len(names)):
+            rotated_names = names[i:] + names[:i]
+            coalesce_expr = ibis.coalesce(*(getattr(table,col) for col in rotated_names))
+            coalesce_expressions.append(coalesce_expr)
+
+        return coalesce_expressions
+
 
 class ScorePhenotype(ComputationGraphPhenotype):
     """

diff --git a/phenex/test/phenotype_test_generator.py b/phenex/test/phenotype_test_generator.py
@@ -20,6 +20,8 @@ class PhenotypeTestGenerator:
     name_space = ""
     date_format = "%m-%d-%Y"
     test_values = False
+    test_date = False
+    join_on = ["PERSON_ID"]
 
     def run_tests(self, verbose=False):
         self.verbose = verbose
@@ -77,7 +79,7 @@ def df_from_test_info(test_info):
             df["PERSON_ID"] = test_info["persons"]
 
             columnname_boolean = "boolean"
-            columnname_date = "DATE"
+            columnname_date = "EVENT_DATE"
             columnname_value = "VALUE"
 
             df[columnname_boolean] = True
@@ -126,13 +128,16 @@ def df_from_test_info(test_info):
                 self.name_output_file(test_info), df
             )
 
-            join_on = ["PERSON_ID"]
+            join_on = ['PERSON_ID']
             if self.test_values:
                 join_on.append("VALUE")
+            if self.test_date:
+                join_on.append("EVENT_DATE")
             check_equality(
                 result_table,
                 expected_output_table,
                 test_name=test_info["name"],
                 test_values=self.test_values,
+                test_date = self.test_date,
                 join_on=join_on
             )
diff --git a/phenex/test/phenotypes/test_logic_phenotype.py b/phenex/test/phenotypes/test_logic_phenotype.py
@@ -133,6 +133,7 @@ def define_phenotype_tests(self):
 
 class LogicPhenotypeReturnDateLastTestGenerator(PhenotypeTestGenerator):
     name_space = "lgpt_returndate_last"
+    test_date = True
 
     def define_input_tables(self):
         """
@@ -179,7 +180,7 @@ def define_input_tables(self):
                 "01-01-2022",  # P7 c3 11
             ]
         ]
-        df["event_date"] = self.event_dates
+        df["EVENT_DATE"] = self.event_dates
 
         df_person = pd.DataFrame()
         df_person["PERSON_ID"] = list(df["PERSON_ID"].unique())
@@ -323,6 +324,8 @@ def define_phenotype_tests(self):
 
 class LogicPhenotypeReturnDateAllTestGenerator(PhenotypeTestGenerator):
     name_space = "lgpt_returndate_all"
+    test_date = True
+    join_on = ["PERSON_ID", "EVENT_DATE"]
 
     def define_input_tables(self):
         """
@@ -369,7 +372,7 @@ def define_input_tables(self):
                 "01-01-2022",  # P7 c3 11
             ]
         ]
-        df["event_date"] = self.event_dates
+        df["EVENT_DATE"] = self.event_dates
 
         df_person = pd.DataFrame()
         df_person["PERSON_ID"] = list(df["PERSON_ID"].unique())
@@ -502,12 +505,12 @@ def define_phenotype_tests(self):
 
         test_infos = [
             c1andc2,
-            # c1orc2,
-            # c1andc3,
-            # c1andc2orc1andc3,
-            # c1andc2andc1andc3,
-            # c1andc2orc3,
-            # c1andc2andc3,
+            c1orc2,
+            c1andc3,
+            c1andc2orc1andc3,
+            c1andc2andc1andc3,
+            c1andc2orc3,
+            c1andc2andc3,
         ]
 
         for test_info in test_infos:
@@ -520,6 +523,7 @@ class LogicPhenotypeInverseReturnDateLastTestGenerator(
     LogicPhenotypeReturnDateLastTestGenerator
 ):
     name_space = "lgpt_inverse_returndate_last"
+    test_date = True
 
     def define_phenotype_tests(self):
         codelist_factory = LocalCSVCodelistFactory(
@@ -672,7 +676,7 @@ def define_phenotype_tests(self):
 
 class LogicPhenotypeReturnDateFirstTestGenerator(PhenotypeTestGenerator):
     name_space = "lgpt_returndate_first"
-
+    test_date = True
     def define_input_tables(self):
         """
         P1,c1,01-01-2022  0
@@ -718,7 +722,7 @@ def define_input_tables(self):
                 "01-01-2022",  # P7 c3 11
             ]
         ]
-        df["event_date"] = self.event_dates
+        df["EVENT_DATE"] = self.event_dates
 
         df_person = pd.DataFrame()
         df_person["PERSON_ID"] = list(df["PERSON_ID"].unique())
@@ -860,25 +864,31 @@ def define_phenotype_tests(self):
         return test_infos
 
 
-def test_logic_phenotype():
-    import ibis
+def test_logic_phenotype_1():
     spg = LogicPhenotypeTestGenerator()
-    # spg.con = ibis.duckdb.connect()
-
-    # spg.run_tests()
+    spg.run_tests()
 
-    # spg = LogicPhenotypeReturnDateLastTestGenerator()
-    # spg.generate()
+def test_logic_phenotype_2():
+    spg = LogicPhenotypeReturnDateLastTestGenerator()
+    spg.run_tests()
 
-    # spg = LogicPhenotypeInverseReturnDateLastTestGenerator()
-    # spg.generate()
+def test_logic_phenotype_3():
+    spg = LogicPhenotypeReturnDateAllTestGenerator()
+    spg.run_tests()
 
-    # spg = LogicPhenotypeReturnDateAllTestGenerator()
-    # spg.generate()
+def test_logic_phenotype_4():
+    spg = LogicPhenotypeReturnDateFirstTestGenerator()
+    spg.run_tests()
 
-    # spg = LogicPhenotypeReturnDateFirstTestGenerator()
-    # spg.generate()
+def test_logic_phenotype_5():
+    pass
+    # spg = LogicPhenotypeInverseReturnDateLastTestGenerator()
+    # spg.run_tests()
 
 
 if __name__ == "__main__":
-    test_logic_phenotype()
+    test_logic_phenotype_1()
+    test_logic_phenotype_2()
+    test_logic_phenotype_3()
+    test_logic_phenotype_4()
+    test_logic_phenotype_5()
diff --git a/phenex/test/util/check_equality.py b/phenex/test/util/check_equality.py
@@ -2,7 +2,7 @@
 
 
 def check_equality(
-    result, expected, join_on=["PERSON_ID"], test_name="test", test_values=False
+    result, expected, join_on=["PERSON_ID"], test_name="test", test_values=False, test_date=False
 ):
     result = result.to_pandas()
     result.loc[:, "DUMMY"] = 1
@@ -21,16 +21,23 @@ def check_equality(
     ), f"Expected not found in test {test_name}: {expected_not_found['PERSON_ID'].values}"
 
     if test_values and 'VALUE' not in join_on:
-        print(full_results)
         values_match = full_results["VALUE_result"] == full_results["VALUE_expected"]
-        print(values_match)
         assert (
             values_match.all()
         ), f"Found unexpected in test {test_name} : not all pairs match"
     elif test_values and 'VALUE' in join_on:
-            print(full_results)
             values_match = full_results["DUMMY_result"] == full_results["DUMMY_expected"]
-            print(values_match)
             assert (
                 values_match.all()
             ), f"Found unexpected in test {test_name} : not all pairs match"
+
+    if test_date and 'EVENT_DATE' not in join_on:
+        dates_match = full_results["EVENT_DATE_result"] == full_results["EVENT_DATE_expected"]
+        assert (
+            dates_match.all()
+        ), f"Found unexpected in test {test_name} : not all pairs match"
+    elif test_date and 'EVENT_DATE' in join_on:
+        dates_match = full_results["DUMMY_result"] == full_results["DUMMY_expected"]
+        assert (
+            dates_match.all()
+        ), f"Found unexpected in test {test_name} : not all pairs match"