-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path2_within_ards.R
127 lines (115 loc) · 5.06 KB
/
2_within_ards.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
# Script to perform per omics per clinical manifestation comparison within ARDS etiologies
# Generates supplementary files 4, 5, and 6
#### SET UP --------
### clear workspace and set directories ------
# NOTE(review): clearing the global environment is kept so that re-running the
# script interactively behaves as before; it does not unload packages or reset
# options.
rm(list = ls())
# set working directory to location of source code: every path used below
# ("input/", "results/", "custom_functions.R") is relative to it.
# rstudioapi can only report the active document from inside RStudio, so when
# the script is run elsewhere (e.g. Rscript) the current working directory is
# kept instead of aborting.
if (requireNamespace("rstudioapi", quietly = TRUE) &&
      rstudioapi::isAvailable()) {
  setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
} else {
  message(
    "RStudio is not available; using the current working directory: ",
    getwd()
  )
}
# make sure result directory exists
dir.create("results/", showWarnings = FALSE, recursive = TRUE)

### input variables ------
# datasets: each name selects the input file input/<name>_processed.xlsx
datasets <- c("plasma_metabo", "plasma_proteo", "plasma_lipids")
# clinical manifestations: one row per outcome, with the type/mode columns that
# are handed to outcome_type_conversion() and within_ards_analysis()
outcomes <- data.frame(
  outcome = c("aki", "platelet", "pf", "death"),
  outcome_type = c(rep("numeric", 3), "binary"),
  outcome_mode = c(rep("numeric", 3), "character")
)
# groups to compare; the analysis is run once per entry of check_groups
complst <- list(
  comps = list(c("Group", "Co19-ARDS", "Bact-Seps")),
  check_groups = c("Bact-Seps", "Co19-ARDS")
)
# adjusted p-value cutoff
pcut <- 0.05
# number of the first supplementary table written by this script; it is
# incremented after each dataset
sfile_num <- 4

### libraries ------
library(tidyverse)
# NOTE(review): the main loop also relies on `%<>%`, rowData(), excel_sheets()
# and read_excel(), none of which are attached by library(tidyverse) --
# presumably custom_functions.R loads the packages providing them; confirm.
source("custom_functions.R")
#### MAIN --------
### load, prepare, and perform within-ards analysis on each dataset ------
## per-dataset output layout ----
# statistic columns kept on every result sheet, in output order
base_stat_cols <- c(
  "name", "outcome", "estimate", "std_error", "statistic", "p_value", "adj_p"
)
# sheets for the mortality outcome ("death" in the sheet name) additionally
# keep fold_change and effect_high_in
death_stat_cols <- c(
  "name", "outcome", "estimate", "std_error", "statistic", "fold_change",
  "p_value", "adj_p", "effect_high_in"
)
# annotation columns appended after the statistics, and the column that is
# renamed (new_name = "old_name") in the formatted output, per dataset
annotation_specs <- list(
  plasma_proteo = list(
    cols = c("OlinkID", "UniprotID", "Panel", "Panel_Version", "kegg_db"),
    rename = c(KEGG_Pathway_IDs = "kegg_db")
  ),
  plasma_metabo = list(
    cols = c(
      "SUPER_PATHWAY", "SUB_PATHWAY", "COMP_ID", "PUBCHEM", "CAS", "KEGG",
      "HMDb"
    ),
    rename = c(HMDB = "HMDb")
  ),
  plasma_lipids = list(
    cols = "Class",
    rename = c(Lipid_Class = "Class")
  )
)

for (dataset in datasets) {
  # Fail loudly, and before any expensive work, on a dataset without a known
  # output layout. (`break("...")` does not raise an error in R: it silently
  # leaves the loop and the script would still report success.)
  if (!dataset %in% names(annotation_specs)) {
    stop(sprintf("unidentified dataset: %s", dataset), call. = FALSE)
  }
  spec <- annotation_specs[[dataset]]

  intermediate_file <- sprintf("results/tmp_%s_within_stats.xlsx", dataset)
  formatted_file <- sprintf(
    "results/supplementary_table_%d_%s_within_ards_clinical_manifestation_stats.xlsx",
    sfile_num, dataset
  )

  # load processed data
  D <- mt_load_se_xls(file = paste0("input/", dataset, "_processed.xlsx")) %>%
    # flag that data is logged
    mt_load_flag_logged() %>%
    # format clinical manifestation columns as per data type (numeric/factor)
    outcome_type_conversion(outcomes)

  # get pathway annotations if there is an UniProt column (used for proteomics)
  if ("UniprotID" %in% (D %>% rowData() %>% colnames())) {
    D %<>%
      mt_anno_pathways_uniprot(
        in_col = "UniprotID",
        out_col = "kegg_db"
      ) %>%
      mt_anno_pathways_remove_redundant(
        feat_col = "UniprotID",
        pw_col = "kegg_db"
      )
  }

  ## perform analysis per outcome per group ----
  # seq_len() keeps the loop from running when `outcomes` has no rows
  for (i in seq_len(nrow(outcomes))) {
    for (group in complst$check_groups) {
      # results are stored under "<outcome>_<group>"
      comp_name <- sprintf("%s_%s", outcomes$outcome[[i]], group)
      D %<>%
        within_ards_analysis(
          outcome_info = outcomes[i, ],
          comp_name = comp_name,
          keep = group,
          pcut = pcut
        )
    }
  }

  # save intermediate output
  write_stats(D, out_file = intermediate_file)

  ## format the dataset output excel files ----
  # read all sheets into a list named by sheet
  tmp_stats <- intermediate_file %>%
    excel_sheets() %>%
    purrr::set_names() %>%
    map(read_excel, path = intermediate_file)

  # on every sheet keep the statistic columns (mortality sheets have two more)
  # followed by the dataset's annotation columns, then apply the rename
  tmp_stats <- purrr::imap(tmp_stats, function(sheet_stats, sheet_name) {
    stat_cols <- if (grepl("death", sheet_name)) {
      death_stat_cols
    } else {
      base_stat_cols
    }
    sheet_stats %>%
      dplyr::select(dplyr::all_of(c(stat_cols, spec$cols))) %>%
      dplyr::rename(dplyr::all_of(spec$rename))
  })

  ## write formatted output ----
  write_dataframes(df_list = tmp_stats, out_file = formatted_file)
  # next dataset goes into the next supplementary table
  sfile_num <- sfile_num + 1
} # END dataset loop
### finished ----
# report completion: print each status line in order (invisible() keeps the
# list returned by lapply() from being echoed as well)
invisible(lapply(
  c(
    "Done! per omics per clinical manifestation comparison within ARDS etiologies!",
    "Generated excel files with supplementary tables 4, 5, and 6 in results folder!"
  ),
  print
))