-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathgenerate_synthetic_data.R
199 lines (169 loc) · 7.46 KB
/
generate_synthetic_data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
#' Generate a dataset according to the probabilistic dropout model
#'
#'
#' @param n_proteins the number of rows in the dataset
#' @param n_conditions the number of conditions. Default: 2
#' @param n_replicates the number of replicates per condition.
#' Can either be a single number or a vector with
#' \code{length(n_replicates) == n_conditions}. Default: 3
#' @param frac_changed the fraction of proteins that actually
#' differ between the conditions. Default: 0.1
#' @param dropout_curve_position the point where the chance
#' to observe a value is 50\%. Can be a single number or
#' a vector of \code{length(dropout_curve_position) == n_conditions * n_replicates}.
#' Default: 18.5
#' @param dropout_curve_scale The width of the dropout curve.
#' Negative numbers mean that lower intensities are more likely
#' to be missing.
#' Can be a single number or a vector of
#' \code{length(dropout_curve_position) == n_conditions * n_replicates}.
#' Default: -1.2
#' @param location_prior_mean,location_prior_scale
#' the position and the variance around which the individual
#' condition means (\code{t_mu}) scatter. Default: 20 and 4
#' @param variance_prior_scale,variance_prior_df
#' the scale and the degrees of freedom of the inverse
#' Chi-squared distribution used as a prior for the
#' variances. Default: 0.05 and 2
#' @param effect_size the standard deviation that is used to draw
#' different values for the \code{frac_changed} part of the
#' proteins. Default: 2
#' @param return_summarized_experiment a boolean indicator if
#' the method should return a \code{SummarizedExperiment}
#' object instead of a list. Default: \code{FALSE}
#'
#' @return a list with the following elements
#' \describe{
#' \item{Y}{the intensity matrix including the missing values}
#' \item{Z}{the intensity matrix before dropping out values}
#' \item{t_mu}{a matrix with \code{n_proteins} rows and
#' \code{n_conditions} columns that contains the underlying
#' means for each protein}
#' \item{t_sigma2}{a vector with the true variances for each
#' protein}
#' \item{changed}{a vector with boolean values if the
#' protein is actually changed}
#' \item{group}{the group structure mapping samples to conditions}
#' }
#' if \code{return_summarized_experiment} is \code{FALSE}. Otherwise
#' returns a \code{SummarizedExperiment} with the same information.
#'
#' @examples
#' syn_data <- generate_synthetic_data(n_proteins = 10)
#' names(syn_data)
#' head(syn_data$Y)
#'
#' # Returning a SummarizedExperiment
#' se <- generate_synthetic_data(n_proteins = 10, return_summarized_experiment = TRUE)
#' se
#' head(SummarizedExperiment::assay(se))
#'
#' @export
generate_synthetic_data <- function(n_proteins, n_conditions = 2,
n_replicates = 3,
frac_changed = 0.1,
dropout_curve_position = 18.5,
dropout_curve_scale = -1.2,
location_prior_mean = 20,
location_prior_scale = 4,
variance_prior_scale = 0.05,
variance_prior_df = 2,
effect_size = 2,
return_summarized_experiment = FALSE) {
if(length(n_replicates) == 1){
n_replicates <- rep(n_replicates, n_conditions)
}else if(length(n_replicates) != n_conditions){
stop("The number of elements in n_replicates must match n_conditions")
}
groups <- unlist(lapply(seq_len(length(n_replicates)), function(idx){
rep(paste0("Condition_", idx), times = n_replicates[idx])
}))
groups <- factor(groups, levels = unique(groups))
if(length(dropout_curve_position) == 1){
dropout_curve_position <- rep(dropout_curve_position, times = sum(n_replicates))
}else if(length(dropout_curve_position) != sum(n_replicates)){
stop("length(dropout_curve_position)=", length(dropout_curve_position)," must either be one or match the",
" number of samples (sum(n_replicates)=", sum(n_replicates), ")")
}
if(length(dropout_curve_scale) == 1){
dropout_curve_scale <- rep(dropout_curve_scale, times = sum(n_replicates))
}else if(length(dropout_curve_scale) != sum(n_replicates)){
stop("length(dropout_curve_scale)=", length(dropout_curve_scale)," must either be one or match the",
" number of samples (sum(n_replicates)=", sum(n_replicates), ")")
}
t_sigma2 <- extraDistr::rinvchisq(n_proteins, nu = variance_prior_df, tau = variance_prior_scale)
n_changed <- round(n_proteins * frac_changed)
t_mu <- matrix(rep(rnorm(n_proteins-n_changed, location_prior_mean,
sd=sqrt(location_prior_scale)), times=n_conditions),
ncol=n_conditions)
t_mu <- rbind(t_mu, mply_dbl(seq_len(n_changed), ncol = n_conditions, function(idx){
avg <- rnorm(1, mean=location_prior_mean, sd=sqrt(location_prior_scale))
eff <- rnorm(n_conditions, mean = 0, sd = effect_size)
avg + eff
}))
Z <- do.call(cbind, lapply(as.numeric(groups), function(cond){
rnorm(n_proteins, t_mu[, cond], sd=sqrt(t_sigma2))
}))
Y <- t(vapply(seq_len(n_proteins), function(idx){
y <- Z[idx, ]
dropout_prob <- invprobit(y, dropout_curve_position, dropout_curve_scale)
y[runif(length(y)) < dropout_prob] <- NA
y
}, FUN.VALUE = rep(0, length(groups))))
changed <- c(rep(FALSE, n_proteins-n_changed), rep(TRUE, n_changed))
colnames(Y) <- paste0(groups, "-", as_replicate(groups))
colnames(Z) <- paste0(groups, "-", as_replicate(groups))
colnames(t_mu) <- levels(groups)
prot_names <- if(nrow(Y) == 0) character(0) else paste0("protein_", seq_len(nrow(Y)))
rownames(Y) <- prot_names
rownames(Z) <- prot_names
rownames(t_mu) <- prot_names
names(t_sigma2) <- prot_names
names(changed) <- prot_names
if(return_summarized_experiment){
cdf <- data.frame(group = groups,
true_dropout_curve_position = dropout_curve_position,
true_dropout_curve_scale = dropout_curve_scale)
rownames(cdf) <- colnames(Y)
rdf <- data.frame(changed = changed,
true_s2 = t_sigma2)
feat_df <- as.data.frame(t_mu)
colnames(feat_df) <- paste0("true_", colnames(feat_df))
rdf <- cbind(rdf, feat_df)
rownames(rdf) <- rownames(Y)
SummarizedExperiment(assays = list(abundances = Y,
full_observations = Z),
colData = cdf,
rowData = rdf)
}else{
return(list(Y=Y, Z=Z, t_mu=t_mu, t_sigma2=t_sigma2,
changed=changed,
groups = groups))
}
}
#' Get numeric vector with the count of the replicate for each element
#'
#' For a vector with repeated values return a vector where each element
#' is the count how often the element was observed previously
#'
#' @param x a vector with repeated elements
#'
#' @return numeric vector
#'
#' @keywords internal
#'
#' @seealso \code{\link[base]{order}}, \code{\link[base]{rank}}
#' @examples
#' x <- c("a", "b", "a", "b", "b", "d")
#' all(proDA:::as_replicate(x) == c(1,1,2,2,3,1))
#'
as_replicate <- function(x){
counter <- as.list(rep(1, times=length(unique(x))))
names(counter) <- unique(x)
replicate <- rep(NA, times=length(x))
for(idx in seq_along(x)){
replicate[idx] <- counter[[x[idx]]]
counter[[x[idx]]] <- counter[[x[idx]]] + 1
}
replicate
}