-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy path32a_multilevel_model.R
245 lines (205 loc) · 10 KB
/
32a_multilevel_model.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
## Multilevel models using Gelman book (chapter 14)
##------------------------------------------------------------------------------
## INITIALIZE / IMPORT DATA
##------------------------------------------------------------------------------
library(geneorama)
sourceDir("R/functions/", trace = FALSE)
loadinstall_libraries(c("labeling", "ggplot2", "e1071", "ROCR", "pROC", "arm", "scales"))
## Read the two .Rds inputs from the data/ directory.
idTable <- readRDS(file = "data/10_calculate_idtable.Rds")
dat     <- readRDS(file = "data/21_full_wnv_data_aggregated.Rds")

## Earlier exploratory code, kept for reference only (not run):
# oracle_traps <- download_oracle_traps()
# wnv <- open_latest_wnv_file()
# mat <- dcast(dat[ , list(tot = total_true + total_false, id, date)],
# date ~ id, value.var = "tot")
# mat_length <- dcast(dat[ , list(tot = total_true + total_false, id, date=round_weeks(date))],
# date ~ id, value.var = "tot", fun.aggregate = length)
# mat_sum <- dcast(dat[ , list(tot = total_true + total_false, id, date=round_weeks(date))],
# date ~ id, value.var = "tot", fun.aggregate = sum)
# n_valid <- apply(mat, 1, function(x)length(x) - length(x[is.na(x)]) + 1)
#

##------------------------------------------------------------------------------
## CREATE CV FOLDS
##------------------------------------------------------------------------------
## Tag each row with its calendar year as a character label ("year" followed by
## the year of `date`). A character group works better than a number when the
## groups are later used as names of a list.
dat[, `:=`(grp = paste0("year", year(date)))]
##------------------------------------------------------------------------------
## CREATE XMAT
## xmat is the table of variables used by the models; here it also carries the
## response (y) columns. Some modelling functions need a literal matrix, but a
## data.table / data.frame is sufficient for the models in this script.
## The response of interest is wnvwnv: "WNV" observed two weeks in a row
## (this week and the previous week).
##------------------------------------------------------------------------------
xmat <- dat[, .(
  ## grouping variables as factors
  id   = as.factor(id),
  grp  = as.factor(grp),
  week = as.factor(week),
  date,
  ## lagged / cumulative WNV columns, passed through unchanged
  wnvw1, wnvw2, wnv_ytd,
  ## weather columns, centred on their mean and scaled by their sd
  awnd = (awnd - mean(awnd)) / sd(awnd),
  prcp = (prcp - mean(prcp)) / sd(prcp),
  tmax = (tmax - mean(tmax)) / sd(tmax),
  ## totals
  tot = total_true + total_false,
  total_true,
  total_false,
  # tot_log = log(total_true + total_false),
  # tot_half = (total_true + total_false) / 2,
  ## responses, each kept both as a number and as a factor
  wnvnum    = wnv,
  wnv       = as.factor(wnv),
  wnvwnv    = as.factor(as.integer(wnv & wnvw1)),
  wnvwnvnum = as.integer(wnv & wnvw1),
  ## columns built from wnv_f1
  wnv_f1,
  wnvwnv_f1 = as.integer(wnv & wnv_f1)
)]

## Attach the mean of `tot` for every id / grp combination as column `mean`.
xmat_means <- xmat[, .(mean = mean(tot)), keyby = .(id, grp)]
xmat <- merge(xmat, xmat_means, by = key(xmat_means))
# xmat[ , tot := (tot - mean(tot)) / sd(tot)]
# xmat[ , tot_half := (tot_half - mean(tot_half)) / sd(tot_half)]

## Drop rows where the two-weeks-in-a-row response is NA.
## Sometimes a trap appears only once in a season, which causes the NA value.
xmat <- xmat[!is.na(wnvwnvnum)]
## Labor day column, computed by the project helper labor_day() from the year
## of each row.
## NOTE(review): presumably this is that year's Labor Day date, since the column
## is later compared against `date` -- confirm against the helper.
xmat[, labor_day := labor_day(year(date))]

## Per-date summary: row count, number of "positive twice" rows, and a
## year-plus-fraction time index derived from the week number.
# xmat[ , range(as.numeric(as.character(week)))]
weeksummary <- xmat[
  ,
  .(.N,
    postwice = sum(wnvwnvnum),
    integertime = year(date) + (as.numeric(as.character(week)) - 20) / 20),
  keyby = .(year(date), week, date)
]
# plot(N ~ integertime, weeksummary, type = "o")
# points(postwice ~ integertime, weeksummary, type = "o", col = "red")

## remove 2009 because it never had any positive values that met the business rule
# xmat[ , list(.N, wnvwnv = sum(wnvwnvnum)), keyby = list(year(date))]
# xmat <- xmat[year(date)!= 2009]

## Keep only the trap ids and weeks that occur outside of 2016 and 2009, so
## that prediction can be done later.
common_ids   <- xmat[!(grp %in% c("year2016", "year2009")), unique(id)]
common_weeks <- xmat[!(grp %in% c("year2016", "year2009")), unique(week)]
xmat <- xmat[id %in% common_ids][week %in% common_weeks]
xmat <- droplevels(xmat)

## Missing cumulative / lagged WNV values are replaced by zero.
xmat[is.na(wnv_ytd), wnv_ytd := 0]
xmat[is.na(wnvw1),   wnvw1   := 0]
xmat[is.na(wnvw2),   wnvw2   := 0]

## Interactive inspection of the finished table
xmat
summary(xmat)
str(xmat)
NAsummary(xmat)
##------------------------------------------------------------------------------
## MODEL 2
## Second glmer model, and the first one that gave reasonable results. It only
## predicts whether WNV will be present, so it does not follow the business
## rule for spraying, which requires WNV to be present two weeks in a row.
##------------------------------------------------------------------------------

##------------------------------------
## Production: run model on all data
##------------------------------------
m2p <- glmer(
  wnv ~ wnvw1 + wnvw2 + wnv_ytd + awnd + tmax + prcp +
    (1 | id) + (1 | week),
  data = xmat,
  family = binomial
)

## Response-scale predictions; the names of the prediction vector are used as
## row numbers of xmat when writing the m2 column.
m2p_response <- lme4:::predict.merMod(m2p, newdata = xmat, type = "response")
xmat[as.integer(names(m2p_response)), m2 := unname(m2p_response)]
rm(m2p_response)
##------------------------------------
## Test: Leave out 2016 as a holdout
##------------------------------------
## The function named below is the one to examine for most errors that happen
## with glmer. Errors happen, for example, when the data isn't complete or
## doesn't contain enough unique values (e.g. near zero variance).
# undebug(lme4:::mkNewReTrms)
m2t <- glmer(
  wnv ~ wnvw1 + wnvw2 + wnv_ytd + awnd + tmax + prcp +
    (1 | id) + (1 | week),
  data = xmat[grp != "year2016"],
  family = binomial
)
display(m2t)
ranef(m2t)

## Response-scale prediction for every row of xmat (holdout year included),
## stored in column m2t.
resp_pred <- lme4:::predict.merMod(m2t, newdata = xmat, type = "response")
xmat[as.integer(names(resp_pred)), m2t := unname(resp_pred)]

## The link-scale prediction is the same prediction before the inverse-logit
## transform; the commented plots compare the two.
link_pred <- lme4:::predict.merMod(m2t, newdata = xmat, type = "link")
# plot(exp(link_pred)/(1+exp(link_pred)) ~ resp_pred)
# hist(exp(link_pred)/(1+exp(link_pred)))
# hist(resp_pred)

## Temporary prediction vectors are not needed any more
rm(resp_pred, link_pred)

## Simple diagnostic plot that shows the separation of classes
# boxplot(m2 ~ wnv, xmat)

## AUC / ROC values:
xmat[TRUE, calculate_metrics(wnv, m2t)]                 ## Overall
xmat[grp == "year2016", calculate_metrics(wnv, m2t)]    ## Holdout
xmat[grp == "year2016", calculate_metrics(wnvwnv, m2t)] ## Effectiveness for wnv twice

## Trap by Date scores
## Trap by Date results
## Viewing these two matrices side by side is useful for checking score
## performance
# matwnv <- dcast(xmat[ , list(wnv, id, date)], date ~ id, value.var = "wnv")
# matm2 <- dcast(xmat[ , list(m2t, id, date)], date ~ id, value.var = "m2t")
##------------------------------------------------------------------------------
## TEST OF DATA WITH PARTIAL FACTOR LEVELS
## Basically this was a round trip test to see how glmer handles factors. Turns
## out that it handles factors very well! So, new data sets with partial factor
## levels are appropriately linked to the right factor. For example, if you
## modeled on AZ, CA, MI, and UT, then predicted on MI and CA, they would be
## linked to levels #3 and #2
##------------------------------------------------------------------------------
# ii <- sample(xmat[, which(!is.na(m2))])[1:10]
# ftemp <- tempfile()
# write.table(xmat[ii,list(wnvw1, wnvw2,awnd,tmax,prcp,id,week)],ftemp,row.names=F)
# fread(ftemp)
# tmp <- fread(ftemp)
# lme4:::predict.merMod(m2, tmp)
# str(tmp)
# xmat[ii,m2]
##------------------------------------------------------------------------------
## MODEL 3
## Predict the actual business case of "two weeks in a row"
##------------------------------------------------------------------------------
## m3  : fitted on all rows of xmat
## m3t : fitted with 2016 held out
## Warnings raised while fitting are muffled per call with suppressWarnings().
## This leaves the session-wide `warn` option untouched, so a caller's setting
## is not overwritten and warnings are not left disabled if a fit stops with
## an error.
m3 <- suppressWarnings(
  glmer(wnvwnv ~ wnvw1 + wnvw2 + wnv_ytd +
          awnd + tmax + prcp + (1 | id) + (1 | week),
        family = binomial, data = xmat)
)
m3t <- suppressWarnings(
  glmer(wnvwnv ~ wnvw1 + wnvw2 + wnv_ytd +
          awnd + tmax + prcp + (1 | id) + (1 | week),
        family = binomial, data = xmat[grp != "year2016"])
)

## Response-scale predictions on the observed data. The names of the prediction
## vector are used as row numbers of xmat.
tmpr <- lme4:::predict.merMod(m3, xmat, type = "response")
xmat[as.integer(names(tmpr)), m3 := unname(tmpr)]
tmpr <- lme4:::predict.merMod(m3t, xmat, type = "response")
xmat[as.integer(names(tmpr)), m3t := unname(tmpr)]
rm(tmpr)

## Add in model forecast
## xmat_f shifts the WNV history forward by one week: this week's result becomes
## lag 1, lag 1 becomes lag 2, and this week's result is added to the
## year-to-date count.
## The lag-1 value is taken from wnvnum (the un-factored copy of wnv) rather
## than from the factor column wnv, so the predictor is not supplied as a
## factor.
## NOTE(review): assumes wnvw1 was numeric 0/1 when the models were fitted --
## confirm against the upstream data.
xmat_f <- xmat[, list(wnvw1 = wnvnum,
                      wnvw2 = wnvw1,
                      wnv_ytd = wnv_ytd + wnvnum,
                      awnd, tmax, prcp, id, week)]
tmpr <- lme4:::predict.merMod(m3, xmat_f, type = "response")
xmat[as.integer(names(tmpr)), m3_forecast := unname(tmpr)]
rm(tmpr)
tmpr <- lme4:::predict.merMod(m3t, xmat_f, type = "response")
xmat[as.integer(names(tmpr)), m3t_forecast := unname(tmpr)]
rm(tmpr)

## Holdout-year metrics for the forecast, against the current two-week outcome
## and against wnvwnv_f1 (rows where it is available).
# boxplot(m3t ~ wnvwnv, xmat[grp=="year2016"])
xmat[grp == "year2016", calculate_metrics(wnvwnv, m3t_forecast)]
xmat[grp == "year2016" & !is.na(wnvwnv_f1), calculate_metrics(wnvwnv_f1, m3t_forecast)]
##------------------------------------------------------------------------------
## Model 3a (No sprays after labor day business rule)
##------------------------------------------------------------------------------
## Same formula as Model 3, fitted only on rows dated before `labor_day` and
## with 2016 held out.
## Fitting warnings are muffled for this call only with suppressWarnings().
## This leaves the session-wide `warn` option untouched, so a caller's setting
## is not overwritten and warnings are not left disabled if the fit stops with
## an error.
m3a <- suppressWarnings(
  glmer(wnvwnv ~ wnvw1 + wnvw2 + wnv_ytd +
          awnd + tmax + prcp + (1 | id) + (1 | week),
        family = binomial, data = xmat[date < labor_day][grp != "year2016"])
)

## Predict for every row of xmat. allow.new.levels = TRUE lets prediction
## proceed for id / week levels that are absent from the reduced training data.
tmpr <- lme4:::predict.merMod(m3a, xmat, type = "response",
                              allow.new.levels = TRUE)
xmat[as.integer(names(tmpr)), m3a := unname(tmpr)]
rm(tmpr)
xmat[grp == "year2016", calculate_metrics(wnvwnv, m3a)]

## Add in model forecast
## xmat_f is the forecast frame built in the Model 3 section.
tmpr <- lme4:::predict.merMod(m3a, xmat_f, type = "response",
                              allow.new.levels = TRUE)
xmat[as.integer(names(tmpr)), m3a_forecast := unname(tmpr)]
rm(tmpr)
xmat[grp == "year2016" & !is.na(wnvwnv_f1), calculate_metrics(wnvwnv_f1, m3a_forecast)]
xmat[grp == "year2016", calculate_metrics(wnvwnv, m3a_forecast)]
## Round every prediction column, i.e. the contiguous run of columns from m2
## through m3a_forecast, to 5 decimal places (updated by reference).
pred_cols <- colnames(xmat)
pred_cols <- pred_cols[match("m2", pred_cols):match("m3a_forecast", pred_cols)]
xmat[, (pred_cols) := lapply(.SD, round, digits = 5), .SDcols = pred_cols]
xmat