-
Notifications
You must be signed in to change notification settings - Fork 2
/
generate_fakedata.R
132 lines (97 loc) · 4.08 KB
/
generate_fakedata.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
# Notes -------------------------------------------------------------------
# Generate fake data in a similar format as the Derexyl/PFDC datasets by sampling the prior predictive distribution
# Initialisation ----------------------------------------------------------
rm(list = ls()) # Clear Workspace (better to restart the session)
source(here::here("analysis", "00_init.R"))
library(foreach)
library(doParallel)
set.seed(1744834965) # Reproducibility (Stan use a different seed)
#### OPTIONS
n_pt <- 16 # Number of patients
n_dur <- rpois(n_pt, 80) # Time-series duration for each patient
p_mis <- 0.1 # Proportion of missing values
p_obs_obs <- 0.9 # Probability that the next value is observed when current value is observed
dgp <- detail_POSCORAD("Items") %>%
mutate(Model = case_when(Name %in% c("extent") ~ "BinMC",
Name %in% c("itching", "sleep") ~ "BinRW",
TRUE ~ "OrderedRW")) # Model for each item
n_chains <- 4
n_it <- 2000
####
dgp <- dgp %>%
mutate(M = Maximum / Resolution) %>%
group_by(Model, M) %>%
mutate(ID = cur_group_id()) %>%
ungroup()
id <- get_index2(n_dur)
# Missing values
id <- lapply(1:n_pt,
function(pid) {
id %>%
filter(Patient == pid) %>%
mutate(Missing = generate_missing(nrow(.), type = "markovchain", p_mis = p_mis, p_obs_obs = p_obs_obs))
}) %>%
bind_rows()
# Processing --------------------------------------------------------------
runs <- dgp %>%
select(ID, Model, M) %>%
distinct()
cl <- makeCluster(nrow(runs), outfile = "")
registerDoParallel(cl)
out <- foreach(i = 1:nrow(runs), .combine = c) %dopar% {
source(here::here("analysis", "00_init.R"))
dgp_i <- dgp %>% filter(ID == i)
model <- EczemaModel(runs$Model[i], max_score = runs$M[i], discrete = TRUE)
param <- list_parameters(model)
fit_prior <- sample_prior(model,
N_patient = n_pt,
t_max = n_dur,
pars = unlist(param),
iter = n_it,
chains = n_chains)
draws <- sample(n_it * n_chains / 2, nrow(dgp_i), replace = FALSE)
out <- lapply(1:nrow(dgp_i),
function(j) {
sim <- extract_simulations(fit_prior, id = id, draw = draws[j], pars = unlist(param[c("Population", "Patient")]))
df <- sim$Data %>%
mutate(Score = Score * dgp_i$Resolution[j],
Score = replace(Score, Missing, NA)) %>%
rename(setNames("Score", dgp_i$Label[j])) %>%
select(-Index, -Missing)
par <- dgp_i[j, ] %>%
select(Model, Label) %>%
rename(Item = Label) %>%
bind_cols(sim$Parameters)
out <- list(Data = df, Parameters = par)
return(out)
})
return(out)
}
stopCluster(cl)
# Save data ---------------------------------------------------------------
df <- lapply(out, function(x) {x$Data}) %>%
Reduce(function(x, y) {full_join(x, y, by = c("Patient", "Time"))}, .) %>%
rename(Day = Time) %>%
mutate(Intensity = Redness + Dryness + `Traces of scratching` + Thickening + Swelling + `Scabs/Oozing`,
`Subjective symptoms` = `Sleep disturbance VAS` + `Itching VAS`,
oSCORAD = 0.2 * Extent + 3.5 * Intensity,
SCORAD = oSCORAD + `Subjective symptoms`)
par <- lapply(out, function(x) {x$Parameters}) %>% bind_rows()
FakeData <- list(Data = df, Parameters = par)
# usethis::use_data(FakeData, overwrite = TRUE)
# Plot data ---------------------------------------------------------------
# df <- FakeData$Data
lapply(sort(sample(1:n_pt, min(n_pt, 4))),
function(pid) {
df %>%
filter(Patient == pid) %>%
drop_na() %>%
ggplot(aes(x = Day, y = SCORAD)) +
geom_line() +
geom_point() +
coord_cartesian(ylim = c(0, 103)) +
labs(colour = "", title = paste0("Patient ", pid)) +
theme_bw(base_size = 15) +
theme(legend.position = "top")
}) %>%
plot_grid(plotlist = ., ncol = 2)