-
Notifications
You must be signed in to change notification settings - Fork 0
/
create_mofa_embedding.R
160 lines (129 loc) · 6.77 KB
/
create_mofa_embedding.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
#' Build a MOFA embedding from several data views and join it to the outcome
#'
#' Every requested view is aligned to the patients of `df_outcome` (missing
#' patients become all-`NA` columns), a MOFA model is trained on the views (or
#' a previously saved model is re-loaded), and the latent factors are merged
#' with `df_outcome` and, when `"pt"` is requested, with `df_pt_use`.
#'
#' @param df_outcome Table whose row names are the patient identifiers; it is
#'   merged into the result by row name.
#' @param df_pt_use Clinical ("pt") table with patient identifiers as row
#'   names.
#' @param embedded_data_types Named list. Each element is a data.frame or
#'   matrix with patients in rows, or the path of a file readable by `load()`
#'   that defines an object called `mat_data`.
#' @param use_types Names of the views to use. The special name `"pt"` refers
#'   to `df_pt_use`. If `use_types` is exactly `"pt"`, no model is trained and
#'   the inner merge of `df_outcome` and `df_pt_use` is returned.
#' @param embed_pt If `TRUE` and `"pt"` is in `use_types`, the clinical
#'   columns are converted with `as.numeric()` and given to MOFA as the views
#'   `pt_int` (the two age variables) and `pt_factor` (all other columns).
#' @param no_mofa_factors Value written to `model_opts$num_factors`.
#' @param sparse_weights Value written to `spikeslab_weights` and
#'   `ard_weights`.
#' @param sparse_factors Value written to `spikeslab_factors` and
#'   `ard_factors`.
#' @param seed Value written to `train_opts$seed`.
#' @param python_path Currently unused (the `use_python()` call is disabled).
#'   The default refers to itself, so it must never be evaluated without being
#'   supplied by the caller.
#' @param save_path_task Directory of the cached model file
#'   `MOFA[PT]_<use_types joined by "_">.Rda`.
#' @param res_dir Directory that receives `MOFA.hdf5` during training.
#' @param force_recreation If `TRUE`, retrain even when the cached model file
#'   exists.
#' @return A data.frame with patients as row names, or `NULL` when the factor
#'   matrix contains non-finite values or fewer than two factors have a
#'   standard deviation above `1e-20`.
create_mofa_embedding <- function(df_outcome = NULL, df_pt_use = NULL,
                                  embedded_data_types = NULL,
                                  use_types = names(embedded_data_types),
                                  embed_pt = FALSE,
                                  no_mofa_factors = 5, # default
                                  sparse_weights = TRUE, # default
                                  sparse_factors = FALSE, # default
                                  seed = 3,
                                  python_path = python_path,
                                  save_path_task = './', res_dir = './',
                                  force_recreation = FALSE) {
  # reticulate::use_python(python = python_path)
  all_pts <- row.names(df_outcome)

  # Clinical data only: nothing to embed, return outcome + clinical columns.
  # `&&` short-circuits, so an empty `use_types` no longer breaks the `if`.
  if (length(use_types) == 1 && use_types[1] == "pt") {
    df_use <- merge(df_outcome, df_pt_use, by = "row.names")
    row.names(df_use) <- df_use[["Row.names"]]
    df_use <- df_use[, !(names(df_use) %in% "Row.names")]
    return(df_use)
  }

  age_vars <- c("years_to_birth",
                "patient.age_at_initial_pathologic_diagnosis")
  list_data_types <- list()

  for (ut in use_types) {
    if (ut != "pt") {
      view_src <- embedded_data_types[[ut]]
      if (is.data.frame(view_src) || is.matrix(view_src)) {
        mat_data <- view_src
      } else {
        # Load into a private environment so that a file without `mat_data`
        # cannot silently reuse the matrix of the previous view.
        view_env <- new.env()
        load(view_src, envir = view_env)
        if (!exists("mat_data", envir = view_env, inherits = FALSE)) {
          stop("The file given for data type '", ut,
               "' does not define an object named 'mat_data'.",
               call. = FALSE)
        }
        mat_data <- get("mat_data", envir = view_env, inherits = FALSE)
      }
      # Assigning a data.frame into a matrix slice would recycle its columns
      # as list elements, so convert it to a matrix first.
      if (is.data.frame(mat_data)) {
        mat_data <- as.matrix(mat_data)
      }

      # One row per outcome patient; patients absent from this view stay NA.
      mat_data_all <- matrix(NA, nrow = length(all_pts), ncol = ncol(mat_data))
      row.names(mat_data_all) <- all_pts
      colnames(mat_data_all) <- colnames(mat_data)
      idx_names <- match(row.names(mat_data), all_pts)
      in_outcome <- !is.na(idx_names)
      mat_data_all[idx_names[in_outcome], ] <-
        mat_data[in_outcome, , drop = FALSE]

      # FOR MOFA, samples are stored in columns and features in rows.
      list_data_types[[ut]] <- t(mat_data_all)
      print(dim(list_data_types[[ut]]))
    } else if (embed_pt) {
      # NOTE(review): `df_pt_use` is deliberately overwritten here, as in the
      # original code, so the merge at the end uses the re-ordered table.
      # Patients missing from `df_pt_use` get row names like "NA", "NA.1" in
      # that table - confirm whether the final merge should see them.
      df_pt_use <- df_pt_use[match(all_pts, row.names(df_pt_use)), ,
                             drop = FALSE]
      df_pt_use_mat <- do.call(rbind, lapply(df_pt_use, as.numeric))
      # Row i of the re-ordered table belongs to all_pts[i]; naming the
      # samples from `all_pts` keeps unmatched patients aligned with the
      # other views.
      colnames(df_pt_use_mat) <- all_pts
      row.names(df_pt_use_mat) <- names(df_pt_use)

      # `drop = FALSE` keeps a single selected feature as a 1-row matrix.
      is_age <- row.names(df_pt_use_mat) %in% age_vars
      if (any(is_age)) {
        list_data_types[["pt_int"]] <- df_pt_use_mat[is_age, , drop = FALSE]
      }
      if (any(!is_age)) {
        list_data_types[["pt_factor"]] <-
          df_pt_use_mat[!is_age, , drop = FALSE]
      }
    }
  }

  data_str <- paste(use_types, collapse = "_")
  mofa_outfile <- file.path(
    save_path_task,
    paste0("MOFA", if (embed_pt) "PT" else "", "_", data_str, ".Rda")
  )

  if (force_recreation || !file.exists(mofa_outfile)) {
    if (length(list_data_types) == 0) {
      stop("No data views available for MOFA: check 'use_types', ",
           "'embedded_data_types' and 'embed_pt'.", call. = FALSE)
    }
    mofaobj <- create_mofa_from_matrix(list_data_types)
    data_opts <- get_default_data_options(mofaobj)
    # data_opts$scale_views = TRUE
    model_opts <- get_default_model_options(mofaobj)
    model_opts$num_factors <- no_mofa_factors
    model_opts$spikeslab_factors <- sparse_factors
    model_opts$spikeslab_weights <- sparse_weights
    model_opts$ard_factors <- sparse_factors
    model_opts$ard_weights <- sparse_weights
    train_opts <- get_default_training_options(mofaobj)
    train_opts$seed <- seed
    # train_opts$convergence_mode <- "slow"
    mofaobj <- prepare_mofa(mofaobj,
                            training_options = train_opts,
                            model_options = model_opts,
                            data_options = data_opts)
    mofaobj <- run_mofa(mofaobj,
                        outfile = file.path(res_dir, "MOFA.hdf5"),
                        save_data = FALSE, use_basilisk = TRUE)
    # The model is cached before it is validated, so a degenerate model is
    # also re-used on later calls unless `force_recreation = TRUE`.
    save(mofaobj, file = mofa_outfile)
  } else {
    cat('loading MOFA outfile: ', mofa_outfile, '\n')
    cache_env <- new.env()
    load(mofa_outfile, envir = cache_env)
    mofaobj <- get("mofaobj", envir = cache_env, inherits = FALSE)
  }

  # Latent factor matrix; assumes the model has a single sample group named
  # "group1" - TODO confirm against create_mofa_from_matrix().
  Z <- mofaobj@expectations$Z$group1

  # Sometimes training returns all Inf or all NA values.
  if (any(!is.finite(Z))) {
    return(NULL)
  }
  # Sometimes it returns a Z matrix with all values set to 0: require at
  # least two factors that actually vary across patients.
  factor_std <- apply(Z, 2, sd, na.rm = TRUE)
  if (sum(factor_std > 10^(-20)) < 2) {
    return(NULL)
  }

  df_emb <- data.frame(Z)
  if ("pt" %in% use_types) {
    # Full outer merge. If complete views are required, patients lacking a
    # view are expected to have been removed beforehand, so
    # all.x / all.y = TRUE adds nothing in that case.
    df_emb <- merge(df_emb, df_pt_use, by = "row.names",
                    all.x = TRUE, all.y = TRUE)
    row.names(df_emb) <- df_emb[["Row.names"]]
    df_emb <- df_emb[, !(names(df_emb) %in% "Row.names")]
  }
  # Same full outer merge, this time with the outcome table.
  df_emb <- merge(df_emb, df_outcome, by = "row.names",
                  all.x = TRUE, all.y = TRUE)
  row.names(df_emb) <- df_emb[["Row.names"]]
  df_emb <- df_emb[, !(names(df_emb) %in% "Row.names")]
  return(df_emb)
}