-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path0_dataprocessing.R
71 lines (61 loc) · 2.22 KB
/
0_dataprocessing.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
#### data processing script
# The processed data used in this study can be downloaded at https://doi.org/10.6084/m9.figshare.19775359
# clear workspace
# NOTE: this removes every object from the calling environment; it does not
# detach packages or reset options.
rm(list = ls())
# set working directory to location of source code
# All paths below are relative ("input/..."), so the script directory must be
# the working directory. Resolution order:
#   1. the active RStudio document (only when RStudio is actually running and
#      the document has been saved, i.e. has a non-empty path),
#   2. the --file= argument that Rscript passes on the command line.
# If neither is available the working directory is left unchanged.
# local() keeps the helper variables out of the global environment.
local({
  script_dir <- NULL
  if (requireNamespace("rstudioapi", quietly = TRUE) &&
      rstudioapi::isAvailable()) {
    doc_path <- rstudioapi::getActiveDocumentContext()$path
    if (length(doc_path) == 1L && nzchar(doc_path)) {
      script_dir <- dirname(doc_path)
    }
  }
  if (is.null(script_dir)) {
    file_arg <- grep("^--file=", commandArgs(trailingOnly = FALSE), value = TRUE)
    if (length(file_arg) > 0) {
      script_dir <- dirname(normalizePath(sub("^--file=", "", file_arg[1])))
    }
  }
  if (is.null(script_dir)) {
    message(
      "Could not determine the script location; keeping working directory: ",
      getwd()
    )
  } else {
    setwd(script_dir)
  }
})
#### init ----
library(maplet)
library(tidyverse)
library(magrittr)
#' Run the shared preprocessing pipeline on one dataset
#'
#' Steps, in order: missingness filtering (samples, then features),
#' normalization via `mt_pre_norm_quot()`, log transformation, kNN imputation,
#' and a final data report. Datasets whose name marks them as proteomics get
#' two extra steps (see inline comments).
#'
#' @param D Object passed through the maplet `mt_*` pipeline; in this script
#'   it is the result of `mt_load_se_xls()` followed by annotation edits.
#' @param dataset Character name of the dataset; only inspected for the
#'   substrings "_proteo" and "proteo".
#' @return The processed object, returned invisibly.
data_preprocessing <- function(D, dataset) {
  # drop samples with more than 50% missing values, then features with more
  # than 25% missing values
  D <- D %>%
    mt_pre_filter_missingness(samp_max = 0.5) %>%
    mt_pre_filter_missingness(feat_max = 0.25)

  # proteomics data: undo the log transformation before normalization
  if (grepl("_proteo", dataset)) {
    D <- D %>% mt_pre_trans_exp()
  }

  # normalization, log transformation, imputation
  D <- D %>%
    mt_pre_norm_quot(feat_max = 0) %>%
    mt_pre_trans_log() %>%
    mt_pre_impute_knn()

  # average duplicate molecules --> specifically needed for proteins
  if (grepl("proteo", dataset)) {
    D <- D %>% mt_modify_avg_features(group_col = 'name')
  }

  # final dataset info
  D <- D %>% mt_reporting_data()
  invisible(D)
}
# make sure result directory exists
dir.create("input/", showWarnings = FALSE, recursive = TRUE)

# datasets to process; each is read from input/<name>_raw.xlsx and written to
# input/<name>_processed.xlsx
datasets <- c('plasma_lipids', "plasma_metabo", "plasma_proteo") #

##### start analysis ----
# loop over datasets ----
for (dataset in datasets) {
  # load and preprocess
  D <-
    # load data
    mt_load_se_xls(file = sprintf('input/%s_raw.xlsx', dataset)) %>%
    # adjust the age variable with ">" sign: ">90" is recoded as "90".
    # as.character() keeps both case_when() branches the same type, so this
    # also works when the age column was read as numeric.
    mt_anno_mutate(
      anno_type = "samples",
      col_name = 'age',
      term = case_when(age %in% ">90" ~ "90", TRUE ~ as.character(age))
    ) %>%
    # make sure age is numeric (it is character after the previous step)
    mt_anno_mutate(
      anno_type = "samples",
      col_name = "age",
      term = as.numeric(age)
    ) %>%
    # preprocess
    data_preprocessing(dataset = dataset)

  # select columns needed; dplyr:: is explicit because other attached
  # packages may export their own select()
  Y <- D %>%
    colData() %>%
    data.frame() %>%
    dplyr::select(Subject_ID, sex, age, aki, platelet, pf, death, Group)

  # rebuild the object with the reduced sample annotations
  D1 <- SummarizedExperiment(assays = assay(D), rowData = rowData(D), colData = Y)

  # save processed data
  D1 %>% mt_write_se_xls(file = paste0("input/", dataset, '_processed.xlsx'))
} # closing dataset loop

## finished
print("Done! per omics data processing!")
print("Generated excel files with processed data in input folder!")