Biomarkers transform for ModelAD (#148)

* Added biomarker files and functions to necessary locations, none are functional at the moment * Added biomarker transform for the Model-AD project. The transform outputs a list and so a new list_to_json() function was added to the load module and logic to handle this was added to the process_dataset function * Biomarkers input and output test files * Added tests for biomarkers * Ran black formatter * Biomarkers test passes when it should * Biomarkers transform working, need to remove custom_transform from yaml * Correct use of the custom_transformations parameter in yaml config file * Added fake test data made by hand for testing biomarkers transform * Added testing for duplicate data * Formatting with black * Addressing PR comment about process_dataset() error message. * Reformatting process.py * Addressing PR comment about TypeError for biomarkers dataset. * Addressing PR comment: Improved docstring and typing for the transform_biomarkers() function. * PR comment: Reverting back to using standard typing hints to prevent problems with Python 3.8 * Removed unused import that caused pre-commit to fail. * Removed unnecessary formatting from ADTDataProcessingError message. * Using typing library to add more specific type hints to the transform_biomarkers() function. * PR comment - using preferred context managed open for converting a list to a json in list_to_json() * Reverting change to see if it fixes CI: pre-commit fail * Maybe now the CI pre-commit will pass? * What about now? Will the CI pre-commit pass now? 
* I think the problem was just formatting * Added test for none/NA/nan values and using pd fillna for missing or none data * Added test for missing data * Added a fail test case for missing columns * Added passing test for datasets with extra/unknown columns * PR comment: removed unnecessary type checking in transform_biomarkers() function * Removed biomarkers test with real data, we have lots of tests with fake data that are easier to validate by hand * Using warning instead of raise exception to allow datasets without a custom transform to be processed as pd.DataFrame. * Simplifying transform_biomarkers() to make it more readable and maintainable * Removing unnecessary warning and explicit isinstance(df,DataFrame) in process_dataset() for converting to json * Added output type hint to apply_custom_transformations() * Removing unnecessary comment * Improved docstring for list_to_json() * Outputting biomarkers transform as pd.DataFrame, removing unnecessary functions that were being used to convert a list to a json * Removed unused import statements * PR comments - removed unused warnings import and added None as part of the apply_custom_transformations() output types * Removing extra typing import, pre-commit passes * Changing ageDead to age_death in all processing and test files --------- Co-authored-by: Beatriz Saldana <[email protected]>
Sage-Bionetworks · Oct 4, 2024 · e02c29b · e02c29b
1 parent 86b5f29
commit e02c29b
Show file tree

Hide file tree

Showing 16 changed files with 406 additions and 3 deletions.
diff --git a/modelad_test_config.yaml b/modelad_test_config.yaml
@@ -0,0 +1,17 @@
+destination: &dest syn51498092
+staging_path: ./staging
+gx_folder: none
+gx_table: none
+datasets:
+  - biomarkers:
+      files:
+        - name: biomarkers
+          id: syn61250724.1
+          format: csv
+      final_format: json
+      provenance:
+        - syn61250724.1
+      destination: *dest
+      custom_transformations: 1
+      column_rename:
+        agedeath: age_death
diff --git a/src/agoradatatools/etl/transform/__init__.py b/src/agoradatatools/etl/transform/__init__.py
@@ -16,6 +16,7 @@
 )
 from agoradatatools.etl.transform.team_info import transform_team_info
 from agoradatatools.etl.transform.proteomics import transform_proteomics
+from agoradatatools.etl.transform.biomarkers import transform_biomarkers
 
 __all__ = [
     "transform_distribution_data",
@@ -28,4 +29,5 @@
     "transform_rnaseq_differential_expression",
     "transform_team_info",
     "transform_proteomics",
+    "transform_biomarkers",
 ]
diff --git a/src/agoradatatools/etl/transform/biomarkers.py b/src/agoradatatools/etl/transform/biomarkers.py
@@ -0,0 +1,46 @@
+"""
+This module contains the transformation logic for the biomarkers dataset.
+This is for the Model AD project.
+"""
+
+import pandas as pd
+from typing import Dict
+
+
def transform_biomarkers(datasets: Dict[str, pd.DataFrame]) -> pd.DataFrame:
    """
    Collapse the biomarkers dataset into one row per biomarker series.

    Rows of the "biomarkers" DataFrame are grouped by 'model', 'type',
    'age_death', 'tissue', and 'units'. Each group becomes a single output
    row holding those five values plus a 'points' column: a list of
    ``{"genotype", "measurement", "sex"}`` dicts, one per input row of the
    group (duplicate input rows are kept as duplicate points). Any other
    input columns are ignored.

    Missing values (NaN/None) anywhere in the dataset are replaced with the
    string "none" before grouping.

    Args:
        datasets (Dict[str, pd.DataFrame]): Dictionary of dataset names mapped
            to their DataFrame. Must contain the key "biomarkers".

    Returns:
        pd.DataFrame: One row per group with the columns 'model', 'type',
            'age_death', 'tissue', 'units', and 'points'. The columns are
            present even when the input has no rows.

    Raises:
        KeyError: If `datasets` has no "biomarkers" entry.
        ValueError: If the biomarkers DataFrame lacks any required column.
    """
    biomarkers_dataset = datasets["biomarkers"]
    group_columns = ["model", "type", "age_death", "tissue", "units"]
    point_columns = ["genotype", "measurement", "sex"]

    missing_columns = [
        col
        for col in group_columns + point_columns
        if col not in biomarkers_dataset.columns
    ]
    if missing_columns:
        raise ValueError(
            f"Biomarker dataset missing columns: {', '.join(missing_columns)}"
        )

    # Fill before grouping so rows with a missing grouping value land in their
    # own "none" group; pandas' groupby excludes NaN keys by default.
    biomarkers_dataset = biomarkers_dataset.fillna("none")

    data_rows = [
        {
            **dict(zip(group_columns, group_key)),
            "points": group[point_columns].to_dict("records"),
        }
        for group_key, group in biomarkers_dataset.groupby(group_columns)
    ]

    # Pin the column set so an empty input still yields the expected schema
    # instead of a DataFrame with no columns.
    return pd.DataFrame(data_rows, columns=group_columns + ["points"])
diff --git a/src/agoradatatools/process.py b/src/agoradatatools/process.py
@@ -1,5 +1,4 @@
 import logging
-import typing
 from typing import Union
 
 import synapseclient
@@ -13,11 +12,14 @@
 from agoradatatools.reporter import ADTGXReporter, DatasetReport
 from agoradatatools.constants import Platform
 
+
 logger = logging.getLogger(__name__)
 
 
 # TODO refactor to avoid so many if's - maybe some sort of mapping to callables
-def apply_custom_transformations(datasets: dict, dataset_name: str, dataset_obj: dict):
+def apply_custom_transformations(
+    datasets: dict, dataset_name: str, dataset_obj: dict
+) -> Union[DataFrame, dict, None]:
     if not isinstance(datasets, dict) or not isinstance(dataset_name, str):
         return None
     if dataset_name == "biodomain_info":
@@ -59,6 +61,8 @@ def apply_custom_transformations(datasets: dict, dataset_name: str, dataset_obj:
     if dataset_name in ["proteomics", "proteomics_tmt", "proteomics_srm"]:
         df = datasets[dataset_name]
         return transform.transform_proteomics(df=df)
+    if dataset_name == "biomarkers":
+        return transform.transform_biomarkers(datasets=datasets)
     else:
         return None
 
@@ -186,7 +190,7 @@ def process_dataset(
 
 def create_data_manifest(
     syn: synapseclient.Synapse, parent: synapseclient.Folder = None
-) -> typing.Union[DataFrame, None]:
+) -> Union[DataFrame, None]:
     """Creates data manifest (dataframe) that has the IDs and version numbers of child synapse folders
 
     Args:

diff --git a/tests/test_assets/biomarkers/input/biomarkers_duplicated_input.csv b/tests/test_assets/biomarkers/input/biomarkers_duplicated_input.csv
@@ -0,0 +1,5 @@
+model,type,measurement,units,age_death,tissue,sex,genotype
+ModelA,TypeA,1,A,1,TissueA,male,genotype1
+ModelA,TypeA,1,A,1,TissueA,male,genotype1
+ModelA,TypeA,1,A,1,TissueA,male,genotype2
+ModelA,TypeA,1,A,1,TissueA,male,genotype2
diff --git a/tests/test_assets/biomarkers/input/biomarkers_extra_column.csv b/tests/test_assets/biomarkers/input/biomarkers_extra_column.csv
@@ -0,0 +1,3 @@
+model,type,measurement,units,age_death,tissue,sex,genotype,extra
+ModelA,TypeA,1,A,1,TissueA,male,genotype1,extra1
+ModelA,TypeA,2,A,1,TissueA,male,genotype1,extra1
diff --git a/tests/test_assets/biomarkers/input/biomarkers_good_test_input.csv b/tests/test_assets/biomarkers/input/biomarkers_good_test_input.csv
@@ -0,0 +1,7 @@
+model,type,measurement,units,age_death,tissue,sex,genotype
+ModelA,TypeA,1,A,1,TissueA,male,genotype1
+ModelA,TypeA,2,A,1,TissueA,male,genotype1
+ModelA,TypeA,3,A,2,TissueA,male,genotype2
+ModelA,TypeB,4,A,2,TissueA,male,genotype1
+ModelA,TypeB,5,A,3,TissueA,male,genotype1
+ModelA,TypeB,6,A,3,TissueA,male,genotype2
diff --git a/tests/test_assets/biomarkers/input/biomarkers_missing_column.csv b/tests/test_assets/biomarkers/input/biomarkers_missing_column.csv
@@ -0,0 +1,3 @@
+model,type,measurement,units,age_death,tissue,sex
+ModelA,TypeA,1,A,1,TissueA,male
+ModelA,TypeA,2,A,1,TissueA,male
diff --git a/tests/test_assets/biomarkers/input/biomarkers_missing_input.csv b/tests/test_assets/biomarkers/input/biomarkers_missing_input.csv
@@ -0,0 +1,4 @@
+model,type,measurement,units,age_death,tissue,sex,genotype
+ModelA,TypeA,1,A,1,TissueA,male,genotype1
+ModelA,TypeA,,A,1,TissueA,male,genotype1
+,TypeA,1,A,1,TissueA,male,genotype1
diff --git a/tests/test_assets/biomarkers/input/biomarkers_none_input.csv b/tests/test_assets/biomarkers/input/biomarkers_none_input.csv
@@ -0,0 +1,10 @@
+model,type,measurement,units,age_death,tissue,sex,genotype
+ModelA,TypeA,1,A,1,TissueA,male,genotype1
+ModelA,TypeA,none,A,1,TissueA,male,genotype1
+ModelA,TypeA,NA,A,1,TissueA,male,genotype1
+ModelA,TypeA,nan,A,1,TissueA,male,genotype1
+ModelA,TypeA,N/A,A,1,TissueA,male,genotype1
+none,TypeA,1,A,1,TissueA,male,genotype1
+none,TypeA,1,A,1,TissueA,male,NA
+NA,TypeA,1,A,1,TissueA,male,genotype1
+ModelA,NA,1,A,1,TissueA,male,genotype1
diff --git a/tests/test_assets/biomarkers/output/biomarkers_duplicated_output.json b/tests/test_assets/biomarkers/output/biomarkers_duplicated_output.json
@@ -0,0 +1,31 @@
+[
+    {
+        "model": "ModelA",
+        "type": "TypeA",
+        "age_death": 1,
+        "tissue": "TissueA",
+        "units": "A",
+        "points": [
+            {
+                "genotype": "genotype1",
+                "measurement": 1,
+                "sex": "male"
+            },
+            {
+                "genotype": "genotype1",
+                "measurement": 1,
+                "sex": "male"
+            },
+            {
+                "genotype": "genotype2",
+                "measurement": 1,
+                "sex": "male"
+            },
+            {
+                "genotype": "genotype2",
+                "measurement": 1,
+                "sex": "male"
+            }
+        ]
+    }
+]
diff --git a/tests/test_assets/biomarkers/output/biomarkers_extra_column_output.json b/tests/test_assets/biomarkers/output/biomarkers_extra_column_output.json
@@ -0,0 +1,21 @@
+[
+    {
+        "model": "ModelA",
+        "type": "TypeA",
+        "age_death": 1,
+        "tissue": "TissueA",
+        "units": "A",
+        "points": [
+            {
+                "genotype": "genotype1",
+                "measurement": 1,
+                "sex": "male"
+            },
+            {
+                "genotype": "genotype1",
+                "measurement": 2,
+                "sex": "male"
+            }
+        ]
+    }
+]
diff --git a/tests/test_assets/biomarkers/output/biomarkers_good_test_output.json b/tests/test_assets/biomarkers/output/biomarkers_good_test_output.json
@@ -0,0 +1,68 @@
+[
+    {
+        "model": "ModelA",
+        "type": "TypeA",
+        "age_death": 1,
+        "tissue": "TissueA",
+        "units": "A",
+        "points": [
+            {
+                "genotype": "genotype1",
+                "measurement": 1,
+                "sex": "male"
+            },
+            {
+                "genotype": "genotype1",
+                "measurement": 2,
+                "sex": "male"
+            }
+        ]
+    },
+    {
+        "model": "ModelA",
+        "type": "TypeA",
+        "age_death": 2,
+        "tissue": "TissueA",
+        "units": "A",
+        "points": [
+            {
+                "genotype": "genotype2",
+                "measurement": 3,
+                "sex": "male"
+            }
+        ]
+    },
+    {
+        "model": "ModelA",
+        "type": "TypeB",
+        "age_death": 2,
+        "tissue": "TissueA",
+        "units": "A",
+        "points": [
+            {
+                "genotype": "genotype1",
+                "measurement": 4,
+                "sex": "male"
+            }
+        ]
+    },
+    {
+        "model": "ModelA",
+        "type": "TypeB",
+        "age_death": 3,
+        "tissue": "TissueA",
+        "units": "A",
+        "points": [
+            {
+                "genotype": "genotype1",
+                "measurement": 5,
+                "sex": "male"
+            },
+            {
+                "genotype": "genotype2",
+                "measurement": 6,
+                "sex": "male"
+            }
+        ]
+    }
+]
diff --git a/tests/test_assets/biomarkers/output/biomarkers_missing_output.json b/tests/test_assets/biomarkers/output/biomarkers_missing_output.json
@@ -0,0 +1,35 @@
+[
+    {
+        "model": "ModelA",
+        "type": "TypeA",
+        "age_death": 1,
+        "tissue": "TissueA",
+        "units": "A",
+        "points": [
+            {
+                "genotype": "genotype1",
+                "measurement": 1.0,
+                "sex": "male"
+            },
+            {
+                "genotype": "genotype1",
+                "measurement": "none",
+                "sex": "male"
+            }
+        ]
+    },
+    {
+        "model": "none",
+        "type": "TypeA",
+        "age_death": 1,
+        "tissue": "TissueA",
+        "units": "A",
+        "points": [
+            {
+                "genotype": "genotype1",
+                "measurement": 1.0,
+                "sex": "male"
+            }
+        ]
+    }
+]
diff --git a/tests/test_assets/biomarkers/output/biomarkers_none_output.json b/tests/test_assets/biomarkers/output/biomarkers_none_output.json
@@ -0,0 +1,74 @@
+[
+    {
+        "model": "ModelA",
+        "type": "TypeA",
+        "age_death": 1,
+        "tissue": "TissueA",
+        "units": "A",
+        "points": [
+            {
+                "genotype": "genotype1",
+                "measurement": "1",
+                "sex": "male"
+            },
+            {
+                "genotype": "genotype1",
+                "measurement": "none",
+                "sex": "male"
+            },
+            {
+                "genotype": "genotype1",
+                "measurement": "none",
+                "sex": "male"
+            },
+            {
+                "genotype": "genotype1",
+                "measurement": "none",
+                "sex": "male"
+            }, 
+            {
+                "genotype": "genotype1",
+                "measurement": "none",
+                "sex": "male"
+            }
+        ]
+    },
+    {
+        "model": "ModelA",
+        "type": "none",
+        "age_death": 1,
+        "tissue": "TissueA",
+        "units": "A",
+        "points": [
+            {
+                "genotype": "genotype1",
+                "measurement": "1",
+                "sex": "male"
+            }
+        ]
+    },
+    {
+        "model": "none",
+        "type": "TypeA",
+        "age_death": 1,
+        "tissue": "TissueA",
+        "units": "A",
+        "points": [
+            {
+                "genotype": "genotype1",
+                "measurement": "1",
+                "sex": "male"
+            },
+            {
+                "genotype": "none",
+                "measurement": "1",
+                "sex": "male"
+            },
+            {
+                "genotype": "genotype1",
+                "measurement": "1",
+                "sex": "male"
+            }
+        ]
+    }
+]