diff --git a/src/agoradatatools/etl/transform/__init__.py b/src/agoradatatools/etl/transform/__init__.py index d5184a95..13917a49 100644 --- a/src/agoradatatools/etl/transform/__init__.py +++ b/src/agoradatatools/etl/transform/__init__.py @@ -16,9 +16,11 @@ from agoradatatools.etl.transform.proteomics_distribution import ( create_proteomics_distribution_data, ) +from agoradatatools.etl.transform.rnaseq_differential_expression import ( + transform_rnaseq_differential_expression, +) from agoradatatools.etl.transform.rna_distribution import ( transform_rna_distribution_data, - transform_rna_seq_data, ) from agoradatatools.etl.transform.team_info import transform_team_info @@ -30,6 +32,6 @@ "transform_overall_scores", "create_proteomics_distribution_data", "transform_rna_distribution_data", - "transform_rna_seq_data", + "transform_rnaseq_differential_expression", "transform_team_info", ] diff --git a/src/agoradatatools/etl/transform/rna_distribution.py b/src/agoradatatools/etl/transform/rna_distribution.py index 12e068f3..d5618f25 100644 --- a/src/agoradatatools/etl/transform/rna_distribution.py +++ b/src/agoradatatools/etl/transform/rna_distribution.py @@ -1,52 +1,10 @@ import numpy as np - - -def transform_rna_seq_data(datasets: dict): - diff_exp_data = datasets["diff_exp_data"] - - diff_exp_data["study"].replace( - to_replace={"MAYO": "MayoRNAseq", "MSSM": "MSBB"}, regex=True, inplace=True - ) - diff_exp_data["sex"].replace( - to_replace={ - "ALL": "males and females", - "FEMALE": "females only", - "MALE": "males only", - }, - regex=True, - inplace=True, - ) - diff_exp_data["model"].replace( - to_replace="\\.", value=" x ", regex=True, inplace=True - ) - diff_exp_data["model"].replace( - to_replace={"Diagnosis": "AD Diagnosis"}, regex=True, inplace=True - ) - diff_exp_data["fc"] = 2 ** diff_exp_data["logfc"] - diff_exp_data["model"] = diff_exp_data["model"] + " (" + diff_exp_data["sex"] + ")" - - diff_exp_data = diff_exp_data[ - [ - "ensembl_gene_id", - "hgnc_symbol", - "logfc", - "fc", - "ci_l", - "ci_r", - "adj_p_val", - "tissue", - "study", - "model", - ] - ] - - return diff_exp_data - +from agoradatatools.etl import transform def transform_rna_distribution_data(datasets: dict): # "datasets" contains the unprocessed RNA-seq data, which needs to go # through the same processing as before in order to use it here. - rna_df = transform_rna_seq_data(datasets) + rna_df = transform.transform_rnaseq_differential_expression(datasets) rna_df = rna_df[["tissue", "model", "logfc"]] rna_df = ( diff --git a/src/agoradatatools/etl/transform/rnaseq_differential_expression.py b/src/agoradatatools/etl/transform/rnaseq_differential_expression.py new file mode 100644 index 00000000..fccb2fb5 --- /dev/null +++ b/src/agoradatatools/etl/transform/rnaseq_differential_expression.py @@ -0,0 +1,40 @@ +def transform_rnaseq_differential_expression(datasets: dict): + diff_exp_data = datasets["diff_exp_data"] + + diff_exp_data["study"].replace( + to_replace={"MAYO": "MayoRNAseq", "MSSM": "MSBB"}, regex=True, inplace=True + ) + diff_exp_data["sex"].replace( + to_replace={ + "ALL": "males and females", + "FEMALE": "females only", + "MALE": "males only", + }, + regex=True, + inplace=True, + ) + diff_exp_data["model"].replace( + to_replace="\\.", value=" x ", regex=True, inplace=True + ) + diff_exp_data["model"].replace( + to_replace={"Diagnosis": "AD Diagnosis"}, regex=True, inplace=True + ) + diff_exp_data["fc"] = 2 ** diff_exp_data["logfc"] + diff_exp_data["model"] = diff_exp_data["model"] + " (" + diff_exp_data["sex"] + ")" + + diff_exp_data = diff_exp_data[ + [ + "ensembl_gene_id", + "hgnc_symbol", + "logfc", + "fc", + "ci_l", + "ci_r", + "adj_p_val", + "tissue", + "study", + "model", + ] + ] + + return diff_exp_data diff --git a/src/agoradatatools/process.py b/src/agoradatatools/process.py index 932304ed..5682eefc 100644 --- a/src/agoradatatools/process.py +++ b/src/agoradatatools/process.py @@ -35,7 +35,7 @@ def apply_custom_transformations(datasets: dict, dataset_name: str, dataset_obj: if dataset_name == "team_info": return transform.transform_team_info(datasets=datasets) if dataset_name == "rnaseq_differential_expression": - return transform.transform_rna_seq_data(datasets=datasets) + return transform.transform_rnaseq_differential_expression(datasets=datasets) if dataset_name == "gene_info": return transform.transform_gene_info( datasets=datasets, diff --git a/tests/test_assets/rna_distribution_data/input/test_rna_distribution_data_bad_input_keyerror.csv b/tests/test_assets/rna_distribution_data/input/test_rna_distribution_data_bad_input_keyerror.csv new file mode 100644 index 00000000..946207be --- /dev/null +++ b/tests/test_assets/rna_distribution_data/input/test_rna_distribution_data_bad_input_keyerror.csv @@ -0,0 +1,7 @@ +model,tissue,comparison,ensembl_gene_id,logfc,ci_l,ci_r,aveexpr,t,p_value,adj_p_val,gene_biotype,chromosome_name,direction,hgnc_symbol,percentage_gc_content,gene_length,sex,study +,CBE,AD-CONTROL,ENSG00000228521,1.646715414,1.219378549,2.074052279,-2.452298708,7.544599094,2.21E-13,2.26E-10,lncRNA,7,UP,AC099552.3,60.6903164,1043,ALL,MAYO +,CBE,AD-CONTROL,ENSG00000286872,-0.181247795,-0.341187573,-0.021308017,0.032732484,-2.226401902,0.026439738,0.06693269,lncRNA,15,NONE,AC024270.5,46.5131873,13422,ALL,MAYO +,CBE,AD-CONTROL,ENSG00000144228,0.080531879,-0.016465937,0.177529695,4.913442808,1.63093857,0.10355023,0.193939662,protein_coding,2,NONE,SPOPL,36.67609796,71747,ALL,MAYO +Diagnosis.AOD,,AD-CONTROL,ENSG00000005339,0.074072272,0.042805132,0.105339413,7.626719604,4.656313999,4.15E-06,8.32E-05,protein_coding,16,NONE,CREBBP,45.00523533,155673,ALL,MAYO +Diagnosis.AOD,,AD-CONTROL,ENSG00000188994,0.03375383,-0.010318552,0.077826212,6.703806497,1.501322866,0.133920382,0.239511453,protein_coding,6,NONE,ZNF292,37.67626491,113111,ALL,MAYO +Diagnosis.AOD,,AD-CONTROL,ENSG00000261823,-0.008434191,-0.077650231,0.060781849,-0.970547902,-0.239149259,0.811089637,0.873760793,lncRNA,15,NONE,AC084782.2,42.13615023,1704,ALL,MAYO \ No newline at end of file diff --git a/tests/test_assets/rna_distribution_data/input/test_rna_distribution_data_bad_input_typeerror.csv b/tests/test_assets/rna_distribution_data/input/test_rna_distribution_data_bad_input_typeerror.csv new file mode 100644 index 00000000..49df9718 --- /dev/null +++ b/tests/test_assets/rna_distribution_data/input/test_rna_distribution_data_bad_input_typeerror.csv @@ -0,0 +1,2 @@ +model,tissue,comparison,ensembl_gene_id,logfc,ci_l,ci_r,aveexpr,t,p_value,adj_p_val,gene_biotype,chromosome_name,direction,hgnc_symbol,percentage_gc_content,gene_length,sex,study +Diagnosis,CBE,AD-CONTROL,ENSG00000228521,No change,1.219378549,2.074052279,-2.452298708,7.544599094,2.21E-13,2.26E-10,lncRNA,7,UP,AC099552.3,60.6903164,1043,ALL,MAYO \ No newline at end of file diff --git a/tests/test_assets/rna_distribution_data/input/test_rna_distribution_data_good_input.csv b/tests/test_assets/rna_distribution_data/input/test_rna_distribution_data_good_input.csv new file mode 100644 index 00000000..6ad87835 --- /dev/null +++ b/tests/test_assets/rna_distribution_data/input/test_rna_distribution_data_good_input.csv @@ -0,0 +1,26 @@ +model,tissue,comparison,ensembl_gene_id,logfc,ci_l,ci_r,aveexpr,t,p_value,adj_p_val,gene_biotype,chromosome_name,direction,hgnc_symbol,percentage_gc_content,gene_length,sex,study +Diagnosis,CBE,AD-CONTROL,ENSG00000228521,1.646715414,1.219378549,2.074052279,-2.452298708,7.544599094,2.21E-13,2.26E-10,lncRNA,7,UP,AC099552.3,60.6903164,1043,ALL,MAYO +Diagnosis,CBE,AD-CONTROL,ENSG00000286872,-0.181247795,-0.341187573,-0.021308017,0.032732484,-2.226401902,0.026439738,0.06693269,lncRNA,15,NONE,AC024270.5,46.5131873,13422,ALL,MAYO +Diagnosis,CBE,AD-CONTROL,ENSG00000144228,0.080531879,-0.016465937,0.177529695,4.913442808,1.63093857,0.10355023,0.193939662,protein_coding,2,NONE,SPOPL,36.67609796,71747,ALL,MAYO +Diagnosis,CBE,AD-CONTROL,ENSG00000285778,0.011774265,-0.225427841,0.248976371,-1.286000104,0.097352193,0.922486354,0.950960873,lncRNA,1,NONE,AL591463.1,36.23399051,127971,ALL,MAYO +Diagnosis.AOD,CBE,AD-CONTROL,ENSG00000005339,0.074072272,0.042805132,0.105339413,7.626719604,4.656313999,4.15E-06,8.32E-05,protein_coding,16,NONE,CREBBP,45.00523533,155673,ALL,MAYO +Diagnosis.AOD,CBE,AD-CONTROL,ENSG00000188994,0.03375383,-0.010318552,0.077826212,6.703806497,1.501322866,0.133920382,0.239511453,protein_coding,6,NONE,ZNF292,37.67626491,113111,ALL,MAYO +Diagnosis.AOD,CBE,AD-CONTROL,ENSG00000261823,-0.008434191,-0.077650231,0.060781849,-0.970547902,-0.239149259,0.811089637,0.873760793,lncRNA,15,NONE,AC084782.2,42.13615023,1704,ALL,MAYO +Diagnosis.Sex,CBE,AD-CONTROL,ENSG00000146733,0.245221101,0.088916757,0.401525444,3.6245628,3.081227128,0.002180227,0.025429476,protein_coding,7,NONE,PSPH,46.3530108,40554,FEMALE,MAYO +Diagnosis.Sex,CBE,AD-CONTROL,ENSG00000164574,0.111683536,-0.075953728,0.299320799,4.556619644,1.16706539,0.24377083,0.459714102,protein_coding,5,NONE,GALNT10,44.28655187,230255,FEMALE,MAYO +Diagnosis.Sex,CBE,AD-CONTROL,ENSG00000105127,0.036793733,-0.059745776,0.133333242,4.795637019,0.751265208,0.452857433,0.660329327,protein_coding,19,NONE,AKAP8,53.23691982,26414,FEMALE,MAYO +Diagnosis.Sex,CBE,AD-CONTROL,ENSG00000205790,0.347641938,0.137145533,0.558138342,-0.677376322,3.255063005,0.001212908,0.010745247,lncRNA,19,UP,DPP9-AS1,59.2620369,6667,MALE,MAYO +Diagnosis.Sex,CBE,AD-CONTROL,ENSG00000171469,-0.147816242,-0.269059529,-0.026572954,3.644836924,-2.399058545,0.016813764,0.066411124,protein_coding,19,NONE,ZNF561,44.45249743,16557,MALE,MAYO +Diagnosis.Sex,CBE,AD-CONTROL,ENSG00000064703,-0.055752044,-0.165988677,0.054484588,3.592526425,-0.996676453,0.319418656,0.497558937,protein_coding,1,NONE,DDX20,40.26340546,12756,MALE,MAYO +Diagnosis,FP,AD-CONTROL,ENSG00000260751,-0.247109844,-0.416792039,-0.077427649,0.47017593,-2.856419551,0.004377501,0.037095362,lncRNA,16,NONE,AC008870.2,43.65234375,1024,ALL,MSSM +Diagnosis,FP,AD-CONTROL,ENSG00000170017,0.116124024,0.013752868,0.218495179,7.605164653,2.220609241,0.026685651,0.111182878,protein_coding,3,NONE,ALCAM,35.28991581,209992,ALL,MSSM +Diagnosis,FP,AD-CONTROL,ENSG00000235290,-0.057859204,-0.252212735,0.136494327,-0.16909436,-0.584873994,0.558811376,0.712709557,unprocessed_pseudogene,6,NONE,HLA-W,53.26582278,1975,ALL,MSSM +Diagnosis,FP,AD-CONTROL,ENSG00000177728,0.017353424,-0.086789821,0.12149667,6.788782124,0.32609337,0.744427926,0.845969576,protein_coding,17,NONE,TMEM94,52.00188889,59294,ALL,MSSM +Diagnosis.Sex,FP,AD-CONTROL,ENSG00000172780,-0.434385421,-0.717838177,-0.150932665,0.396064873,-3.00527216,0.00271497,0.065139157,protein_coding,3,NONE,RAB43,50.78477564,35233,MALE,MSSM +Diagnosis.Sex,FP,AD-CONTROL,ENSG00000269054,-0.155588882,-0.396743565,0.0855658,0.942597545,-1.265404041,0.206045462,0.491878373,lncRNA,19,NONE,AC012313.6,56.27619238,5011,MALE,MSSM +Diagnosis.Sex,FP,AD-CONTROL,ENSG00000197497,0.079706407,-0.081429533,0.240842348,2.881636183,0.973898187,0.33048948,0.611179707,protein_coding,19,NONE,ZNF665,45.25955405,34174,MALE,MSSM +Diagnosis.Sex,FP,AD-CONTROL,ENSG00000155827,0.02916353,-0.045914974,0.104242034,5.680912284,0.768042101,0.442712225,0.699692125,protein_coding,9,NONE,RNF20,38.16967313,29492,MALE,MSSM +Diagnosis,ACC,AD-CONTROL,ENSG00000125841,-0.181579939,-0.256616609,-0.10654327,6.725621038,-4.743548393,2.22E-06,6.90E-05,protein_coding,20,NONE,NRSN2,51.93726221,12879,ALL,ROSMAP +Diagnosis,ACC,AD-CONTROL,ENSG00000135898,-0.074241909,-0.471057434,0.322573617,0.222262803,-0.366638152,0.713920945,0.827016898,protein_coding,2,NONE,GPR55,48.82695492,53749,ALL,ROSMAP +Diagnosis,ACC,AD-CONTROL,ENSG00000151834,-0.027222811,-0.121567916,0.067122295,6.530023004,-0.56550242,0.571784411,0.72424515,protein_coding,4,NONE,GABRA2,35.62900125,226804,ALL,ROSMAP +Diagnosis.AOD,ACC,AD-CONTROL,ENSG00000099365,-0.093331151,-0.124320202,-0.0623421,8.150447477,-5.901609746,4.10E-09,4.84E-08,protein_coding,16,NONE,STX1B,55.21208437,21383,ALL,ROSMAP \ No newline at end of file diff --git a/tests/test_assets/rna_distribution_data/input/test_rna_distribution_data_missing_values.csv b/tests/test_assets/rna_distribution_data/input/test_rna_distribution_data_missing_values.csv new file mode 100644 index 00000000..5d3c3ce7 --- /dev/null +++ b/tests/test_assets/rna_distribution_data/input/test_rna_distribution_data_missing_values.csv @@ -0,0 +1,26 @@ +model,tissue,comparison,ensembl_gene_id,logfc,ci_l,ci_r,aveexpr,t,p_value,adj_p_val,gene_biotype,chromosome_name,direction,hgnc_symbol,percentage_gc_content,gene_length,sex,study +,CBE,AD-CONTROL,ENSG00000228521,1.646715414,1.219378549,2.074052279,-2.452298708,7.544599094,2.21E-13,2.26E-10,lncRNA,7,UP,AC099552.3,60.6903164,1043,ALL,MAYO +Diagnosis,CBE,AD-CONTROL,ENSG00000286872,-0.181247795,-0.341187573,-0.021308017,0.032732484,-2.226401902,0.026439738,0.06693269,lncRNA,15,NONE,AC024270.5,46.5131873,13422,ALL,MAYO +Diagnosis,CBE,AD-CONTROL,ENSG00000144228,0.080531879,-0.016465937,0.177529695,4.913442808,1.63093857,0.10355023,0.193939662,protein_coding,2,NONE,SPOPL,36.67609796,71747,ALL,MAYO +Diagnosis,CBE,AD-CONTROL,ENSG00000285778,0.011774265,-0.225427841,0.248976371,-1.286000104,0.097352193,0.922486354,0.950960873,lncRNA,1,NONE,AL591463.1,36.23399051,127971,ALL,MAYO +Diagnosis.AOD,CBE,AD-CONTROL,ENSG00000005339,0.074072272,0.042805132,0.105339413,7.626719604,4.656313999,4.15E-06,8.32E-05,protein_coding,16,NONE,CREBBP,45.00523533,155673,ALL,MAYO +Diagnosis.AOD,,AD-CONTROL,ENSG00000188994,0.03375383,-0.010318552,0.077826212,6.703806497,1.501322866,0.133920382,0.239511453,protein_coding,6,NONE,ZNF292,37.67626491,113111,ALL,MAYO +Diagnosis.AOD,CBE,AD-CONTROL,ENSG00000261823,-0.008434191,-0.077650231,0.060781849,-0.970547902,-0.239149259,0.811089637,0.873760793,lncRNA,15,NONE,AC084782.2,42.13615023,1704,ALL,MAYO +Diagnosis.Sex,CBE,AD-CONTROL,ENSG00000146733,0.245221101,0.088916757,0.401525444,3.6245628,3.081227128,0.002180227,0.025429476,protein_coding,7,NONE,PSPH,46.3530108,40554,FEMALE,MAYO +Diagnosis.Sex,CBE,AD-CONTROL,ENSG00000164574,0.111683536,-0.075953728,0.299320799,4.556619644,1.16706539,0.24377083,0.459714102,protein_coding,5,NONE,GALNT10,44.28655187,230255,FEMALE,MAYO +Diagnosis.Sex,CBE,AD-CONTROL,ENSG00000105127,0.036793733,-0.059745776,0.133333242,4.795637019,0.751265208,0.452857433,0.660329327,protein_coding,19,NONE,AKAP8,53.23691982,26414,FEMALE,MAYO +Diagnosis.Sex,CBE,AD-CONTROL,ENSG00000205790,0.347641938,0.137145533,0.558138342,-0.677376322,3.255063005,0.001212908,0.010745247,lncRNA,19,UP,DPP9-AS1,59.2620369,6667,MALE,MAYO +Diagnosis.Sex,CBE,AD-CONTROL,ENSG00000171469,-0.147816242,-0.269059529,-0.026572954,3.644836924,-2.399058545,0.016813764,0.066411124,protein_coding,19,NONE,ZNF561,44.45249743,16557,MALE,MAYO +Diagnosis.Sex,CBE,AD-CONTROL,ENSG00000064703,-0.055752044,-0.165988677,0.054484588,3.592526425,-0.996676453,0.319418656,0.497558937,protein_coding,1,NONE,DDX20,40.26340546,12756,MALE,MAYO +Diagnosis,FP,AD-CONTROL,ENSG00000260751,-0.247109844,-0.416792039,-0.077427649,0.47017593,-2.856419551,0.004377501,0.037095362,lncRNA,16,NONE,AC008870.2,43.65234375,1024,ALL,MSSM +Diagnosis,FP,AD-CONTROL,ENSG00000170017,,0.013752868,0.218495179,7.605164653,2.220609241,0.026685651,0.111182878,protein_coding,3,NONE,ALCAM,35.28991581,209992,ALL,MSSM +Diagnosis,FP,AD-CONTROL,ENSG00000235290,-0.057859204,-0.252212735,0.136494327,-0.16909436,-0.584873994,0.558811376,0.712709557,unprocessed_pseudogene,6,NONE,HLA-W,53.26582278,1975,ALL,MSSM +Diagnosis,FP,AD-CONTROL,ENSG00000177728,0.017353424,-0.086789821,0.12149667,6.788782124,0.32609337,0.744427926,0.845969576,protein_coding,17,NONE,TMEM94,52.00188889,59294,ALL,MSSM +Diagnosis.Sex,FP,AD-CONTROL,ENSG00000172780,,-0.717838177,-0.150932665,0.396064873,-3.00527216,0.00271497,0.065139157,protein_coding,3,NONE,RAB43,50.78477564,35233,MALE,MSSM +Diagnosis.Sex,FP,AD-CONTROL,ENSG00000269054,,-0.396743565,0.0855658,0.942597545,-1.265404041,0.206045462,0.491878373,lncRNA,19,NONE,AC012313.6,56.27619238,5011,MALE,MSSM +Diagnosis.Sex,FP,AD-CONTROL,ENSG00000197497,,-0.081429533,0.240842348,2.881636183,0.973898187,0.33048948,0.611179707,protein_coding,19,NONE,ZNF665,45.25955405,34174,MALE,MSSM +Diagnosis.Sex,FP,AD-CONTROL,ENSG00000155827,,-0.045914974,0.104242034,5.680912284,0.768042101,0.442712225,0.699692125,protein_coding,9,NONE,RNF20,38.16967313,29492,MALE,MSSM +Diagnosis,ACC,AD-CONTROL,ENSG00000125841,-0.181579939,-0.256616609,-0.10654327,6.725621038,-4.743548393,2.22E-06,6.90E-05,protein_coding,20,NONE,NRSN2,51.93726221,12879,ALL,ROSMAP +Diagnosis,ACC,AD-CONTROL,ENSG00000135898,-0.074241909,-0.471057434,0.322573617,0.222262803,-0.366638152,0.713920945,0.827016898,protein_coding,2,NONE,GPR55,48.82695492,53749,ALL,ROSMAP +Diagnosis,ACC,AD-CONTROL,ENSG00000151834,-0.027222811,-0.121567916,0.067122295,6.530023004,-0.56550242,0.571784411,0.72424515,protein_coding,4,NONE,GABRA2,35.62900125,226804,ALL,ROSMAP +Diagnosis.AOD,ACC,AD-CONTROL,ENSG00000099365,-0.093331151,-0.124320202,-0.0623421,8.150447477,-5.901609746,4.10E-09,4.84E-08,protein_coding,16,NONE,STX1B,55.21208437,21383,ALL,ROSMAP \ No newline at end of file diff --git a/tests/test_assets/rna_distribution_data/output/rna_distribution_data_good_output.json b/tests/test_assets/rna_distribution_data/output/rna_distribution_data_good_output.json new file mode 100644 index 00000000..64cb4f78 --- /dev/null +++ b/tests/test_assets/rna_distribution_data/output/rna_distribution_data_good_output.json @@ -0,0 +1,74 @@ +[ + { + "model": "AD Diagnosis (males and females)", + "tissue": "ACC", + "min": -0.2437, + "max": 0.065, + "first_quartile": -0.1279, + "median": -0.0742, + "third_quartile": -0.0507 + }, + { + "model": "AD Diagnosis x AOD (males and females)", + "tissue": "ACC", + "min": -0.0933, + "max": -0.0933, + "first_quartile": -0.0933, + "median": -0.0933, + "third_quartile": -0.0933 + }, + { + "model": "AD Diagnosis (males and females)", + "tissue": "CBE", + "min": -0.7993, + "max": 1.2349, + "first_quartile": -0.0365, + "median": 0.0462, + "third_quartile": 0.4721 + }, + { + "model": "AD Diagnosis x AOD (males and females)", + "tissue": "CBE", + "min": -0.0492, + "max": 0.1158, + "first_quartile": 0.0127, + "median": 0.0338, + "third_quartile": 0.0539 + }, + { + "model": "AD Diagnosis x Sex (females only)", + "tissue": "CBE", + "min": -0.0821, + "max": 0.3348, + "first_quartile": 0.0742, + "median": 0.1117, + "third_quartile": 0.1785 + }, + { + "model": "AD Diagnosis x Sex (males only)", + "tissue": "CBE", + "min": -0.4734, + "max": 0.5175, + "first_quartile": -0.1018, + "median": -0.0558, + "third_quartile": 0.1459 + }, + { + "model": "AD Diagnosis (males and females)", + "tissue": "FP", + "min": -0.326, + "max": 0.2629, + "first_quartile": -0.1052, + "median": -0.0203, + "third_quartile": 0.042 + }, + { + "model": "AD Diagnosis x Sex (males only)", + "tissue": "FP", + "min": -0.6259, + "max": 0.4424, + "first_quartile": -0.2253, + "median": -0.0632, + "third_quartile": 0.0418 + } +] \ No newline at end of file diff --git a/tests/test_assets/rna_distribution_data/output/rna_distribution_data_missing_data_output.json b/tests/test_assets/rna_distribution_data/output/rna_distribution_data_missing_data_output.json new file mode 100644 index 00000000..3c0e9a62 --- /dev/null +++ b/tests/test_assets/rna_distribution_data/output/rna_distribution_data_missing_data_output.json @@ -0,0 +1,74 @@ +[ + { + "model": "AD Diagnosis (males and females)", + "tissue": "ACC", + "min": -0.2437, + "max": 0.065, + "first_quartile": -0.1279, + "median": -0.0742, + "third_quartile": -0.0507 + }, + { + "model": "AD Diagnosis x AOD (males and females)", + "tissue": "ACC", + "min": -0.0933, + "max": -0.0933, + "first_quartile": -0.0933, + "median": -0.0933, + "third_quartile": -0.0933 + }, + { + "model": "AD Diagnosis (males and females)", + "tissue": "CBE", + "min": -0.2811, + "max": 0.2425, + "first_quartile": -0.0847, + "median": 0.0118, + "third_quartile": 0.0462 + }, + { + "model": "AD Diagnosis x AOD (males and females)", + "tissue": "CBE", + "min": -0.0497, + "max": 0.1153, + "first_quartile": 0.0122, + "median": 0.0328, + "third_quartile": 0.0534 + }, + { + "model": "AD Diagnosis x Sex (females only)", + "tissue": "CBE", + "min": -0.0821, + "max": 0.3348, + "first_quartile": 0.0742, + "median": 0.1117, + "third_quartile": 0.1785 + }, + { + "model": "AD Diagnosis x Sex (males only)", + "tissue": "CBE", + "min": -0.4734, + "max": 0.5175, + "first_quartile": -0.1018, + "median": -0.0558, + "third_quartile": 0.1459 + }, + { + "model": "AD Diagnosis (males and females)", + "tissue": "FP", + "min": -0.3508, + "max": 0.1781, + "first_quartile": -0.1525, + "median": -0.0579, + "third_quartile": -0.0203 + }, + { + "model": "AD Diagnosis x Sex (males only)", + "tissue": "FP", + "min": null, + "max": null, + "first_quartile": null, + "median": null, + "third_quartile": null + } +] \ No newline at end of file diff --git a/tests/test_assets/rnaseq_differential_expression/input/test_rnaseq_differential_expression_bad_input.csv b/tests/test_assets/rnaseq_differential_expression/input/test_rnaseq_differential_expression_bad_input.csv new file mode 100644 index 00000000..edbcce43 --- /dev/null +++ b/tests/test_assets/rnaseq_differential_expression/input/test_rnaseq_differential_expression_bad_input.csv @@ -0,0 +1,2 @@ +model,tissue,comparison,ensembl_gene_id,logfc,ci_l,ci_r,aveexpr,t,p_value,adj_p_val,gene_biotype,chromosome_name,direction,hgnc_symbol,percentage_gc_content,gene_length,sex,study +Diagnosis.AOD,TCX,AD-CONTROL,ENSG00000122965,No change,-0.00786889,0.040130378,5.039925164,1.326614942,0.185257779,0.264418818,protein_coding,12,NONE,RBM19,50.4271733,149588,ALL,MAYO \ No newline at end of file diff --git a/tests/test_assets/rnaseq_differential_expression/input/test_rnaseq_differential_expression_good_input.csv b/tests/test_assets/rnaseq_differential_expression/input/test_rnaseq_differential_expression_good_input.csv new file mode 100644 index 00000000..2f7afbee --- /dev/null +++ b/tests/test_assets/rnaseq_differential_expression/input/test_rnaseq_differential_expression_good_input.csv @@ -0,0 +1,10 @@ +model,tissue,comparison,ensembl_gene_id,logfc,ci_l,ci_r,aveexpr,t,p_value,adj_p_val,gene_biotype,chromosome_name,direction,hgnc_symbol,percentage_gc_content,gene_length,sex,study +Diagnosis,CBE,AD-CONTROL,ENSG00000228521,1.646715414,1.219378549,2.074052279,-2.452298708,7.544599094,2.21E-13,2.26E-10,lncRNA,7,UP,AC099552.3,60.6903164,1043,ALL,MAYO +Diagnosis.AOD,TCX,AD-CONTROL,ENSG00000122965,0.016130744,-0.00786889,0.040130378,5.039925164,1.326614942,0.185257779,0.264418818,protein_coding,12,NONE,RBM19,50.4271733,149588,ALL,MAYO +Diagnosis.Sex,TCX,AD-CONTROL,ENSG00000117505,-0.100739949,-0.227581707,0.026101809,6.089715998,-1.561013119,0.119190521,0.221713628,protein_coding,1,NONE,DR1,36.43956417,23587,MALE,MAYO +Diagnosis,PHG,AD-CONTROL,ENSG00000205500,0.00024765,-0.113637527,0.114132827,1.88042486,0.004268084,0.996595427,0.997692468,lncRNA,2,NONE,AC013472.1,43.6002279,29837,ALL,MSSM +Diagnosis.AOD,PHG,AD-CONTROL,ENSG00000117395,-0.096808478,-0.133253512,-0.060363443,4.966345698,-5.251633926,3.14E-07,2.15E-06,protein_coding,1,NONE,EBNA1BP2,44.37908619,106762,ALL,MSSM +Diagnosis.Sex,IFG,AD-CONTROL,ENSG00000265148,-0.080245797,-0.279006952,0.118515359,1.038905826,-0.791926933,0.428588197,0.651561105,lncRNA,17,NONE,TSPOAP1-AS1,46.31031272,91295,FEMALE,MSSM +Diagnosis,DLPFC,AD-CONTROL,ENSG00000196296,-0.156662145,-0.275874831,-0.037449459,-0.187763112,-2.576961292,0.010025873,0.036221827,protein_coding,16,NONE,ATP2A1,54.03945604,26105,ALL,ROSMAP +Diagnosis.AOD,ACC,AD-CONTROL,ENSG00000171115,0.090141673,0.060772347,0.119510999,2.785585776,6.015396381,2.07E-09,2.63E-08,protein_coding,7,NONE,GIMAP8,43.24155194,28764,ALL,ROSMAP +Diagnosis.Sex,ACC,AD-CONTROL,ENSG00000240889,-0.297125273,-0.526509713,-0.067740833,-0.443143774,-2.539789602,0.011154011,0.120844742,lncRNA,7,NONE,NDUFB2-AS1,55.74052813,1742,MALE,ROSMAP \ No newline at end of file diff --git a/tests/test_assets/rnaseq_differential_expression/input/test_rnaseq_differential_expression_missing_values.csv b/tests/test_assets/rnaseq_differential_expression/input/test_rnaseq_differential_expression_missing_values.csv new file mode 100644 index 00000000..a6448d65 --- /dev/null +++ b/tests/test_assets/rnaseq_differential_expression/input/test_rnaseq_differential_expression_missing_values.csv @@ -0,0 +1,6 @@ +model,tissue,comparison,ensembl_gene_id,logfc,ci_l,ci_r,aveexpr,t,p_value,adj_p_val,gene_biotype,chromosome_name,direction,hgnc_symbol,percentage_gc_content,gene_length,sex,study +Diagnosis,CBE,AD-CONTROL,ENSG00000228521,1.646715414,1.219378549,2.074052279,-2.452298708,7.544599094,2.21E-13,2.26E-10,lncRNA,7,UP,AC099552.3,60.6903164,1043,ALL,MAYO +,TCX,AD-CONTROL,ENSG00000122965,0.016130744,-0.00786889,0.040130378,5.039925164,1.326614942,0.185257779,0.264418818,protein_coding,12,NONE,RBM19,50.4271733,149588,ALL,MAYO +Diagnosis.Sex,TCX,AD-CONTROL,ENSG00000117505,-0.100739949,-0.227581707,0.026101809,6.089715998,-1.561013119,0.119190521,0.221713628,protein_coding,1,NONE,DR1,36.43956417,23587,MALE, +Diagnosis.AOD,PHG,AD-CONTROL,ENSG00000117395,-0.096808478,-0.133253512,-0.060363443,4.966345698,-5.251633926,3.14E-07,2.15E-06,protein_coding,1,NONE,EBNA1BP2,44.37908619,106762,,MSSM +Diagnosis.AOD,ACC,AD-CONTROL,ENSG00000171115,,0.060772347,0.119510999,2.785585776,6.015396381,2.07E-09,2.63E-08,protein_coding,7,NONE,GIMAP8,43.24155194,28764,FEMALE,ROSMAP \ No newline at end of file diff --git a/tests/test_assets/rnaseq_differential_expression/output/rnaseq_differential_expression_good_output.json b/tests/test_assets/rnaseq_differential_expression/output/rnaseq_differential_expression_good_output.json new file mode 100644 index 00000000..c3d50f54 --- /dev/null +++ b/tests/test_assets/rnaseq_differential_expression/output/rnaseq_differential_expression_good_output.json @@ -0,0 +1,110 @@ +[ + { + "ensembl_gene_id": "ENSG00000228521", + "hgnc_symbol": "AC099552.3", + "logfc": 1.64671541373435, + "fc": 3.1311994631711295, + "ci_l": 1.21937854853272, + "ci_r": 2.07405227893598, + "adj_p_val": 2.26170424459412e-10, + "tissue": "CBE", + "study": "MayoRNAseq", + "model": "AD Diagnosis (males and females)" + }, + { + "ensembl_gene_id": "ENSG00000122965", + "hgnc_symbol": "RBM19", + "logfc": 0.0161307443150122, + "fc": 1.011243720714922, + "ci_l": -0.0078688895717805, + "ci_r": 0.0401303782018049, + "adj_p_val": 0.264418817515348, + "tissue": "TCX", + "study": "MayoRNAseq", + "model": "AD Diagnosis x AOD (males and females)" + }, + { + "ensembl_gene_id": "ENSG00000117505", + "hgnc_symbol": "DR1", + "logfc": -0.100739949027997, + "fc": 0.9325545676039445, + "ci_l": -0.227581706924239, + "ci_r": 0.026101808868246, + "adj_p_val": 0.221713627957082, + "tissue": "TCX", + "study": "MayoRNAseq", + "model": "AD Diagnosis x Sex (males only)" + }, + { + "ensembl_gene_id": "ENSG00000205500", + "hgnc_symbol": "AC013472.1", + "logfc": 0.0002476496605992, + "fc": 1.0001716723980307, + "ci_l": -0.11363752717897, + "ci_r": 0.114132826500169, + "adj_p_val": 0.997692467509696, + "tissue": "PHG", + "study": "MSBB", + "model": "AD Diagnosis (males and females)" + }, + { + "ensembl_gene_id": "ENSG00000117395", + "hgnc_symbol": "EBNA1BP2", + "logfc": -0.0968084778973198, + "fc": 0.9350993267580804, + "ci_l": -0.133253512461975, + "ci_r": -0.0603634433326643, + "adj_p_val": 2.15192980822548e-06, + "tissue": "PHG", + "study": "MSBB", + "model": "AD Diagnosis x AOD (males and females)" + }, + { + "ensembl_gene_id": "ENSG00000265148", + "hgnc_symbol": "TSPOAP1-AS1", + "logfc": -0.0802457967097812, + "fc": 0.9458964774956181, + "ci_l": -0.279006952050323, + "ci_r": 0.11851535863076, + "adj_p_val": 0.651561104581782, + "tissue": "IFG", + "study": "MSBB", + "model": "AD Diagnosis x Sex (females only)" + }, + { + "ensembl_gene_id": "ENSG00000196296", + "hgnc_symbol": "ATP2A1", + "logfc": -0.156662145003257, + "fc": 0.89709822043115, + "ci_l": -0.275874831245501, + "ci_r": -0.0374494587610133, + "adj_p_val": 0.0362218271087407, + "tissue": "DLPFC", + "study": "ROSMAP", + "model": "AD Diagnosis (males and females)" + }, + { + "ensembl_gene_id": "ENSG00000171115", + "hgnc_symbol": "GIMAP8", + "logfc": 0.0901416730208155, + "fc": 1.0644747090088102, + "ci_l": 0.0607723473670167, + "ci_r": 0.119510998674614, + "adj_p_val": 2.63101445252297e-08, + "tissue": "ACC", + "study": "ROSMAP", + "model": "AD Diagnosis x AOD (males and females)" + }, + { + "ensembl_gene_id": "ENSG00000240889", + "hgnc_symbol": "NDUFB2-AS1", + "logfc": -0.297125272991841, + "fc": 0.8138725113186905, + "ci_l": -0.526509712833402, + "ci_r": -0.0677408331502798, + "adj_p_val": 0.120844742424367, + "tissue": "ACC", + "study": "ROSMAP", + "model": "AD Diagnosis x Sex (males only)" + } +] \ No newline at end of file diff --git a/tests/test_assets/rnaseq_differential_expression/output/rnaseq_differential_expression_missing_data_output.json b/tests/test_assets/rnaseq_differential_expression/output/rnaseq_differential_expression_missing_data_output.json new file mode 100644 index 00000000..32da0fcf --- /dev/null +++ b/tests/test_assets/rnaseq_differential_expression/output/rnaseq_differential_expression_missing_data_output.json @@ -0,0 +1,62 @@ +[ + { + "ensembl_gene_id": "ENSG00000228521", + "hgnc_symbol": "AC099552.3", + "logfc": 1.646715414, + "fc": 3.1311994637476914, + "ci_l": 1.219378549, + "ci_r": 2.074052279, + "adj_p_val": 2.26e-10, + "tissue": "CBE", + "study": "MayoRNAseq", + "model": "AD Diagnosis (males and females)" + }, + { + "ensembl_gene_id": "ENSG00000122965", + "hgnc_symbol": "RBM19", + "logfc": 0.016130744, + "fc": 1.0112437204941171, + "ci_l": -0.00786889, + "ci_r": 0.040130378, + "adj_p_val": 0.264418818, + "tissue": "TCX", + "study": "MayoRNAseq", + "model": null + }, + { + "ensembl_gene_id": "ENSG00000117505", + "hgnc_symbol": "DR1", + "logfc": -0.100739949, + "fc": 0.9325545676220417, + "ci_l": -0.227581707, + "ci_r": 0.026101809, + "adj_p_val": 0.221713628, + "tissue": "TCX", + "study": null, + "model": "AD Diagnosis x Sex (males only)" + }, + { + "ensembl_gene_id": "ENSG00000117395", + "hgnc_symbol": "EBNA1BP2", + "logfc": -0.096808478, + "fc": 0.9350993266915271, + "ci_l": -0.133253512, + "ci_r": -0.060363443, + "adj_p_val": 2.15e-06, + "tissue": "PHG", + "study": "MSBB", + "model": null + }, + { + "ensembl_gene_id": "ENSG00000171115", + "hgnc_symbol": "GIMAP8", + "logfc": null, + "fc": null, + "ci_l": 0.060772347, + "ci_r": 0.119510999, + "adj_p_val": 2.63e-08, + "tissue": "ACC", + "study": "ROSMAP", + "model": "AD Diagnosis x AOD (females only)" + } +] \ No newline at end of file diff --git a/tests/transform/test_rna_distribution_data.py b/tests/transform/test_rna_distribution_data.py new file mode 100644 index 00000000..87727897 --- /dev/null +++ b/tests/transform/test_rna_distribution_data.py @@ -0,0 +1,63 @@ +import os + +import pandas as pd +import pytest + +from agoradatatools.etl.transform import rna_distribution + + +class TestTransformRnaDistributionData: + data_files_path = "tests/test_assets/rna_distribution_data" + pass_test_data = [ + ( # pass with good data + "test_rna_distribution_data_good_input.csv", + "rna_distribution_data_good_output.json", + ), + ( # pass with missing data + "test_rna_distribution_data_missing_values.csv", + "rna_distribution_data_missing_data_output.json", + ), + ] + pass_test_ids = [ + "Pass with good data", + "Pass with missing data", + ] + fail_test_data = [ + ( # Fail with a TypeError due to string value in logfc + "test_rna_distribution_data_bad_input_typeerror.csv", + TypeError, + ), + ( # Fail with a KeyError due to too many missing values + "test_rna_distribution_data_bad_input_keyerror.csv", + KeyError, + ), + ] + fail_test_ids = [ + "Fail with bad data type", + "Fail with too many missing values", + ] + + @pytest.mark.parametrize( + "input_file, expected_output_file", pass_test_data, ids=pass_test_ids + ) + def test_transform_rna_distribution_data_should_pass( + self, input_file, expected_output_file + ): + input_df = pd.read_csv(os.path.join(self.data_files_path, "input", input_file)) + output_df = rna_distribution.transform_rna_distribution_data( + datasets={"diff_exp_data": input_df} + ) + expected_df = pd.read_json( + os.path.join(self.data_files_path, "output", expected_output_file), + ) + pd.testing.assert_frame_equal(output_df, expected_df) + + @pytest.mark.parametrize("input_file, error_type", fail_test_data, ids=fail_test_ids) + def test_transform_rna_distribution_data_should_fail(self, input_file, error_type): + with pytest.raises(error_type): + input_df = pd.read_csv( + os.path.join(self.data_files_path, "input", input_file) + ) + rna_distribution.transform_rna_distribution_data( + datasets={"diff_exp_data": input_df} + ) diff --git a/tests/transform/test_rnaseq_differential_expression.py b/tests/transform/test_rnaseq_differential_expression.py new file mode 100644 index 00000000..e37d5d71 --- /dev/null +++ b/tests/transform/test_rnaseq_differential_expression.py @@ -0,0 +1,57 @@ +import os + +import pandas as pd +import pytest + +from agoradatatools.etl.transform import rnaseq_differential_expression + + +class TestTransformRnaseqDifferentialExpression: + data_files_path = "tests/test_assets/rnaseq_differential_expression" + pass_test_data = [ + ( # pass with good data + "test_rnaseq_differential_expression_good_input.csv", + "rnaseq_differential_expression_good_output.json", + ), + ( # pass with missing data + "test_rnaseq_differential_expression_missing_values.csv", + "rnaseq_differential_expression_missing_data_output.json", + ), + ] + pass_test_ids = [ + "Pass with good data", + "Pass with missing data", + ] + fail_test_data = [ + "test_rnaseq_differential_expression_bad_input.csv", # fail with bad data + ] + fail_test_ids = [ + "Fail with bad data type", + ] + + @pytest.mark.parametrize( + "input_file, expected_output_file", pass_test_data, ids=pass_test_ids + ) + def test_transform_rnaseq_differential_expression_should_pass( + self, input_file, expected_output_file + ): + input_df = pd.read_csv(os.path.join(self.data_files_path, "input", input_file)) + output_df = ( + rnaseq_differential_expression.transform_rnaseq_differential_expression( + datasets={"diff_exp_data": input_df} + ) + ) + expected_df = pd.read_json( + os.path.join(self.data_files_path, "output", expected_output_file), + ) + pd.testing.assert_frame_equal(output_df, expected_df) + + @pytest.mark.parametrize("input_file", fail_test_data, ids=fail_test_ids) + def test_transform_rnaseq_differential_expression_should_fail(self, input_file): + with pytest.raises(TypeError): + input_df = pd.read_csv( + os.path.join(self.data_files_path, "input", input_file) + ) + rnaseq_differential_expression.transform_rnaseq_differential_expression( + datasets={"diff_exp_data": input_df} + )