def transform_biomarkers(datasets: Dict[str, pd.DataFrame]) -> pd.DataFrame:
    """Collapse the biomarkers dataset into one row per measurement group.

    Rows are grouped by 'model', 'type', 'age_death', 'tissue' and 'units'.
    Each output row carries those five values plus a 'points' column holding
    a list of ``{"genotype", "measurement", "sex"}`` records, one per input
    row in the group. Missing values are replaced with the string "none"
    before grouping, so rows with a missing group value are kept (grouped
    under "none") rather than dropped by ``groupby``.

    Args:
        datasets (Dict[str, pd.DataFrame]): Dictionary of dataset names
            mapped to their DataFrame. Must contain the key "biomarkers".

    Returns:
        pd.DataFrame: One row per group with the columns 'model', 'type',
            'age_death', 'tissue', 'units' and 'points'. These columns are
            present even when the input has no rows.

    Raises:
        KeyError: If ``datasets`` has no "biomarkers" entry.
        ValueError: If the biomarkers DataFrame lacks any required column.
    """
    biomarkers_dataset = datasets["biomarkers"]
    group_columns = ["model", "type", "age_death", "tissue", "units"]
    point_columns = ["genotype", "measurement", "sex"]

    missing_columns = [
        col
        for col in group_columns + point_columns
        if col not in biomarkers_dataset.columns
    ]
    if missing_columns:
        raise ValueError(
            f"Biomarker dataset missing columns: {', '.join(missing_columns)}"
        )

    # Fill before grouping: groupby drops NaN keys by default, which would
    # silently discard rows whose group columns are missing.
    biomarkers_dataset = biomarkers_dataset.fillna("none")

    data_rows = [
        {
            **dict(zip(group_columns, group_key)),
            "points": group[point_columns].to_dict("records"),
        }
        for group_key, group in biomarkers_dataset.groupby(group_columns)
    ]

    # Pin the output schema so an input with zero rows still yields the
    # expected columns instead of a column-less DataFrame.
    return pd.DataFrame(data_rows, columns=group_columns + ["points"])
transform.transform_biomarkers(datasets=datasets) else: return None @@ -186,7 +190,7 @@ def process_dataset( def create_data_manifest( syn: synapseclient.Synapse, parent: synapseclient.Folder = None -) -> typing.Union[DataFrame, None]: +) -> Union[DataFrame, None]: """Creates data manifest (dataframe) that has the IDs and version numbers of child synapse folders Args: diff --git a/tests/test_assets/biomarkers/input/biomarkers_duplicated_input.csv b/tests/test_assets/biomarkers/input/biomarkers_duplicated_input.csv new file mode 100644 index 00000000..b0091d2f --- /dev/null +++ b/tests/test_assets/biomarkers/input/biomarkers_duplicated_input.csv @@ -0,0 +1,5 @@ +model,type,measurement,units,age_death,tissue,sex,genotype +ModelA,TypeA,1,A,1,TissueA,male,genotype1 +ModelA,TypeA,1,A,1,TissueA,male,genotype1 +ModelA,TypeA,1,A,1,TissueA,male,genotype2 +ModelA,TypeA,1,A,1,TissueA,male,genotype2 diff --git a/tests/test_assets/biomarkers/input/biomarkers_extra_column.csv b/tests/test_assets/biomarkers/input/biomarkers_extra_column.csv new file mode 100644 index 00000000..2e0b287f --- /dev/null +++ b/tests/test_assets/biomarkers/input/biomarkers_extra_column.csv @@ -0,0 +1,3 @@ +model,type,measurement,units,age_death,tissue,sex,genotype,extra +ModelA,TypeA,1,A,1,TissueA,male,genotype1,extra1 +ModelA,TypeA,2,A,1,TissueA,male,genotype1,extra1 diff --git a/tests/test_assets/biomarkers/input/biomarkers_good_test_input.csv b/tests/test_assets/biomarkers/input/biomarkers_good_test_input.csv new file mode 100644 index 00000000..72e5b23f --- /dev/null +++ b/tests/test_assets/biomarkers/input/biomarkers_good_test_input.csv @@ -0,0 +1,7 @@ +model,type,measurement,units,age_death,tissue,sex,genotype +ModelA,TypeA,1,A,1,TissueA,male,genotype1 +ModelA,TypeA,2,A,1,TissueA,male,genotype1 +ModelA,TypeA,3,A,2,TissueA,male,genotype2 +ModelA,TypeB,4,A,2,TissueA,male,genotype1 +ModelA,TypeB,5,A,3,TissueA,male,genotype1 +ModelA,TypeB,6,A,3,TissueA,male,genotype2 diff --git 
a/tests/test_assets/biomarkers/input/biomarkers_missing_column.csv b/tests/test_assets/biomarkers/input/biomarkers_missing_column.csv new file mode 100644 index 00000000..f3099769 --- /dev/null +++ b/tests/test_assets/biomarkers/input/biomarkers_missing_column.csv @@ -0,0 +1,3 @@ +model,type,measurement,units,age_death,tissue,sex +ModelA,TypeA,1,A,1,TissueA,male +ModelA,TypeA,2,A,1,TissueA,male diff --git a/tests/test_assets/biomarkers/input/biomarkers_missing_input.csv b/tests/test_assets/biomarkers/input/biomarkers_missing_input.csv new file mode 100644 index 00000000..e1b0a8d0 --- /dev/null +++ b/tests/test_assets/biomarkers/input/biomarkers_missing_input.csv @@ -0,0 +1,4 @@ +model,type,measurement,units,age_death,tissue,sex,genotype +ModelA,TypeA,1,A,1,TissueA,male,genotype1 +ModelA,TypeA,,A,1,TissueA,male,genotype1 +,TypeA,1,A,1,TissueA,male,genotype1 diff --git a/tests/test_assets/biomarkers/input/biomarkers_none_input.csv b/tests/test_assets/biomarkers/input/biomarkers_none_input.csv new file mode 100644 index 00000000..fda48b74 --- /dev/null +++ b/tests/test_assets/biomarkers/input/biomarkers_none_input.csv @@ -0,0 +1,10 @@ +model,type,measurement,units,age_death,tissue,sex,genotype +ModelA,TypeA,1,A,1,TissueA,male,genotype1 +ModelA,TypeA,none,A,1,TissueA,male,genotype1 +ModelA,TypeA,NA,A,1,TissueA,male,genotype1 +ModelA,TypeA,nan,A,1,TissueA,male,genotype1 +ModelA,TypeA,N/A,A,1,TissueA,male,genotype1 +none,TypeA,1,A,1,TissueA,male,genotype1 +none,TypeA,1,A,1,TissueA,male,NA +NA,TypeA,1,A,1,TissueA,male,genotype1 +ModelA,NA,1,A,1,TissueA,male,genotype1 diff --git a/tests/test_assets/biomarkers/output/biomarkers_duplicated_output.json b/tests/test_assets/biomarkers/output/biomarkers_duplicated_output.json new file mode 100644 index 00000000..35acca28 --- /dev/null +++ b/tests/test_assets/biomarkers/output/biomarkers_duplicated_output.json @@ -0,0 +1,31 @@ +[ + { + "model": "ModelA", + "type": "TypeA", + "age_death": 1, + "tissue": "TissueA", + "units": "A", 
+ "points": [ + { + "genotype": "genotype1", + "measurement": 1, + "sex": "male" + }, + { + "genotype": "genotype1", + "measurement": 1, + "sex": "male" + }, + { + "genotype": "genotype2", + "measurement": 1, + "sex": "male" + }, + { + "genotype": "genotype2", + "measurement": 1, + "sex": "male" + } + ] + } +] \ No newline at end of file diff --git a/tests/test_assets/biomarkers/output/biomarkers_extra_column_output.json b/tests/test_assets/biomarkers/output/biomarkers_extra_column_output.json new file mode 100644 index 00000000..3561d90f --- /dev/null +++ b/tests/test_assets/biomarkers/output/biomarkers_extra_column_output.json @@ -0,0 +1,21 @@ +[ + { + "model": "ModelA", + "type": "TypeA", + "age_death": 1, + "tissue": "TissueA", + "units": "A", + "points": [ + { + "genotype": "genotype1", + "measurement": 1, + "sex": "male" + }, + { + "genotype": "genotype1", + "measurement": 2, + "sex": "male" + } + ] + } +] \ No newline at end of file diff --git a/tests/test_assets/biomarkers/output/biomarkers_good_test_output.json b/tests/test_assets/biomarkers/output/biomarkers_good_test_output.json new file mode 100644 index 00000000..e1beeda8 --- /dev/null +++ b/tests/test_assets/biomarkers/output/biomarkers_good_test_output.json @@ -0,0 +1,68 @@ +[ + { + "model": "ModelA", + "type": "TypeA", + "age_death": 1, + "tissue": "TissueA", + "units": "A", + "points": [ + { + "genotype": "genotype1", + "measurement": 1, + "sex": "male" + }, + { + "genotype": "genotype1", + "measurement": 2, + "sex": "male" + } + ] + }, + { + "model": "ModelA", + "type": "TypeA", + "age_death": 2, + "tissue": "TissueA", + "units": "A", + "points": [ + { + "genotype": "genotype2", + "measurement": 3, + "sex": "male" + } + ] + }, + { + "model": "ModelA", + "type": "TypeB", + "age_death": 2, + "tissue": "TissueA", + "units": "A", + "points": [ + { + "genotype": "genotype1", + "measurement": 4, + "sex": "male" + } + ] + }, + { + "model": "ModelA", + "type": "TypeB", + "age_death": 3, + "tissue": 
"TissueA", + "units": "A", + "points": [ + { + "genotype": "genotype1", + "measurement": 5, + "sex": "male" + }, + { + "genotype": "genotype2", + "measurement": 6, + "sex": "male" + } + ] + } +] \ No newline at end of file diff --git a/tests/test_assets/biomarkers/output/biomarkers_missing_output.json b/tests/test_assets/biomarkers/output/biomarkers_missing_output.json new file mode 100644 index 00000000..f736159a --- /dev/null +++ b/tests/test_assets/biomarkers/output/biomarkers_missing_output.json @@ -0,0 +1,35 @@ +[ + { + "model": "ModelA", + "type": "TypeA", + "age_death": 1, + "tissue": "TissueA", + "units": "A", + "points": [ + { + "genotype": "genotype1", + "measurement": 1.0, + "sex": "male" + }, + { + "genotype": "genotype1", + "measurement": "none", + "sex": "male" + } + ] + }, + { + "model": "none", + "type": "TypeA", + "age_death": 1, + "tissue": "TissueA", + "units": "A", + "points": [ + { + "genotype": "genotype1", + "measurement": 1.0, + "sex": "male" + } + ] + } +] \ No newline at end of file diff --git a/tests/test_assets/biomarkers/output/biomarkers_none_output.json b/tests/test_assets/biomarkers/output/biomarkers_none_output.json new file mode 100644 index 00000000..f9272cb3 --- /dev/null +++ b/tests/test_assets/biomarkers/output/biomarkers_none_output.json @@ -0,0 +1,74 @@ +[ + { + "model": "ModelA", + "type": "TypeA", + "age_death": 1, + "tissue": "TissueA", + "units": "A", + "points": [ + { + "genotype": "genotype1", + "measurement": "1", + "sex": "male" + }, + { + "genotype": "genotype1", + "measurement": "none", + "sex": "male" + }, + { + "genotype": "genotype1", + "measurement": "none", + "sex": "male" + }, + { + "genotype": "genotype1", + "measurement": "none", + "sex": "male" + }, + { + "genotype": "genotype1", + "measurement": "none", + "sex": "male" + } + ] + }, + { + "model": "ModelA", + "type": "none", + "age_death": 1, + "tissue": "TissueA", + "units": "A", + "points": [ + { + "genotype": "genotype1", + "measurement": "1", + "sex": 
class TestTransformBiomarkers:
    """Fixture-driven tests for ``biomarkers.transform_biomarkers``.

    Each passing case reads a CSV from ``input/``, runs the transform and
    compares the result with the JSON file of the same scenario in
    ``output/``. The failing case checks that an input lacking a required
    column is rejected with a ``ValueError``.
    """

    # Relative path: resolved against the directory pytest is invoked from.
    data_files_path = "tests/test_assets/biomarkers"

    # (input CSV, expected output JSON) pairs; order matches pass_test_ids.
    pass_test_data = [
        (
            # Pass with good fake data
            "biomarkers_good_test_input.csv",
            "biomarkers_good_test_output.json",
        ),
        (
            # Pass with duplicated data
            "biomarkers_duplicated_input.csv",
            "biomarkers_duplicated_output.json",
        ),
        (
            # Pass with none data
            "biomarkers_none_input.csv",
            "biomarkers_none_output.json",
        ),
        (
            # Pass with missing data
            "biomarkers_missing_input.csv",
            "biomarkers_missing_output.json",
        ),
        (
            # Pass with extra column
            "biomarkers_extra_column.csv",
            "biomarkers_extra_column_output.json",
        ),
    ]
    pass_test_ids = [
        "Pass with good fake data",
        "Pass with duplicated data",
        "Pass with none data",
        "Pass with missing data",
        "Pass with extra column",
    ]
    # Plain strings: a single parametrized argument needs no tuple, and
    # ("x") is just a parenthesised string anyway.
    fail_test_data = ["biomarkers_missing_column.csv"]
    fail_test_ids = ["Fail with missing column"]

    @pytest.mark.parametrize(
        "biomarkers_file, expected_output_file", pass_test_data, ids=pass_test_ids
    )
    def test_transform_biomarkers_should_pass(
        self, biomarkers_file, expected_output_file
    ):
        """The transformed CSV input must equal the expected JSON output."""
        biomarkers_df = pd.read_csv(
            os.path.join(self.data_files_path, "input", biomarkers_file)
        )
        # transform_biomarkers already returns a DataFrame; no re-wrapping.
        output_df = biomarkers.transform_biomarkers(
            datasets={"biomarkers": biomarkers_df}
        )
        expected_df = pd.read_json(
            os.path.join(self.data_files_path, "output", expected_output_file),
        )
        pd.testing.assert_frame_equal(output_df, expected_df)

    @pytest.mark.parametrize("biomarkers_file", fail_test_data, ids=fail_test_ids)
    def test_transform_biomarkers_should_fail(
        self, biomarkers_file, error_type: "type[BaseException]" = ValueError
    ):
        """An input lacking a required column must raise ``error_type``."""
        biomarkers_df = pd.read_csv(
            os.path.join(self.data_files_path, "input", biomarkers_file)
        )
        # Match the message so an unrelated ValueError cannot satisfy the test.
        with pytest.raises(error_type, match="Biomarker dataset missing columns"):
            biomarkers.transform_biomarkers(datasets={"biomarkers": biomarkers_df})