-
Notifications
You must be signed in to change notification settings - Fork 1
/
train_bs.R
196 lines (184 loc) · 9.61 KB
/
train_bs.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
train_bs <- function(data.db.train,data.db.test,parameters,methodz,do.bootstrap=TRUE,R.BS=1000,blocklength=12,sample.countries = FALSE,min_cr = 5){
  # ------------------------------------
  # CALL
  # train_bs(data.db.train,data.db.test,parameters,methodz,R.BS,blocklength)
  # train_bs(data.db.train,data.db.test,parameters,methodz,R.BS,blocklength,sample.countries)
  # train_bs(data.db.train,data.db.test,parameters,methodz,R.BS,blocklength,sample.countries,min_cr)
  # ------------------------------------
  # DESCRIPTION
  # Performs a bootstrap on data from data.db.train in order to provide a bootstrapped forecast on data.db.test.
  # In case of in-sample estimation, data.db.train (for bootstrapping) and data.db.test (for forecasting) should be identical.
  # Bootstrap specificities:
  # - block bootstrap
  # - cross sectional blocks (cannot be excluded at the current moment)
  # - panel out-of-sample (dropping whole countries)
  # Note that this function transforms input data.tables into data.frames for estimation purposes.
  # ------------------------------------
  # INPUT
  # data.db.train: in-sample estimation data.table with Country, Date, c0, pre, latepre, post, and early warning indicators
  # data.db.test: out-of-sample prediction data.table with the same columns
  # parameters: parameters from preparation scripts. Fields read here: cols.expl, col.resp, standardizeList,
  #             optimizationMode, paramtable, mu, optimizethreshold
  # methodz: vector with methods to be estimated. All methods are of the form "m.i", where "m" stands for an
  #          estimation method, and "i" for the numbering of the dataset (index into parameters$cols.expl)
  # do.bootstrap: Boolean if bootstrap on training data should be performed or not (if FALSE, R.BS is forced to 1)
  # R.BS: number of bootstrap draws
  # blocklength: number of adjacent periods in a block; RECOMMENDED: EARLY WARNING HORIZON. Reduced if only smaller blocks are available.
  # sample.countries: TRUE for panel out-of-sample bootstrap, FALSE otherwise (NOT RECOMMENDED FOR RECURSIVE ESTIMATION).
  # min_cr: minimum number of pre-crisis observations in bootstrap sample. Needed for valid estimations
  # ------------------------------------
  # OUTPUT
  # list containing
  # - data.out: T x output x R.BS array, where
  #     T is the number of observations in data.db.test
  #     output is "PriorP" followed by "Prob(method)" and "OT(method)" for every method in methodz
  #     (OT holds the threshold returned by calculate.threshold; in the non-bootstrap case it is set to
  #      mean(pre) of the training data when the estimation of a method fails)
  # - modelz: estimated models (one per method in methodz) (NOT IN BOOTSTRAP-CASE)
  # - data.test.use: test data used by the last method (NOT IN BOOTSTRAP CASE)
  # Gregor von Schweinitz 2020-11-17
  Nobs <- dim(data.db.train)[1]

  if (do.bootstrap) {
    b <- blocklength
    # Without a single pre-crisis observation no bootstrap sample can ever reach min_cr:
    # fail early instead of redrawing forever.
    if (min_cr > 0 && sum(data.db.train[["pre"]], na.rm = TRUE) <= 0) {
      stop("train_bs: data.db.train contains no pre-crisis observations, min_cr cannot be reached",
           call. = FALSE)
    }
    # Define all possible start and end-dates for bootstrap blocks.
    # Account for the fact that blocks might need to be shorter than b periods in order to give every
    # observation an equal chance to be drawn. In order to manage shorter blocks at the beginning, we give
    # the additional (b-1) starting observations "fictitious earlier starting dates".
    date.all <- as.numeric(data.db.train[["Date"]])
    country.all <- as.character(data.db.train$Country)
    countries <- unique(country.all)
    time.grid <- sort(unique(date.all))
    time.add <- seq(-(b - 1) / 4, 0, 1 / 4)
    # time.grid[-1] (rather than 2:length) stays correct when there is only one date
    time.vec <- c(time.grid[1] + time.add, time.grid[-1])
    # Dates are multiples of a quarter, so a tiny tolerance gives exact matching without
    # depending on floating point representation.
    date.tol <- 1e-8

    # One data.frame of candidate blocks per country, bound together once at the end.
    # Keeping pos1/pos2/Date numeric (instead of a character matrix) allows exact matching below.
    blocks <- vector("list", length(countries))
    for (i in seq_along(countries)) {
      coun <- countries[i]
      pos <- which(country.all == coun)
      pos1 <- c(rep(pos[1], b - 1), pos)              # first row of each block
      pos2 <- c(pos, rep(pos[length(pos)], b - 1))    # last row of each block
      time.coun <- date.all[pos]
      time.coun1 <- date.all[pos1]
      time.coun2 <- date.all[pos2]
      time.add1 <- c(time.add[1:min(b, length(pos1))], rep(0, max(0, length(pos1) - b)))
      # Account for missing observations, as latepre, c0, post are not included in data.db.train:
      # a block must not reach further than b periods after its first observation.
      pos.na <- which(time.coun1 + b / 4 - 0.25 < time.coun2)
      for (k in pos.na) {
        pos2[k] <- pos[max(which(time.coun <= time.coun1[k] + b / 4 - 0.25))]
      }
      blocks[[i]] <- data.frame(pos1 = pos1, pos2 = pos2,
                                Date = time.coun1 + time.add1,
                                Country = coun,
                                stringsAsFactors = FALSE)
    }
    sample.master <- do.call(rbind, blocks)
  } else {
    R.BS <- 1
  }

  output.names <- c("PriorP",
                    paste0("Prob(", methodz, ")"),
                    paste0("OT(", methodz, ")"))
  data.out <- array(dim = c(dim(data.db.test)[1], length(output.names), R.BS),
                    dimnames = list(rownames(data.db.test),
                                    output.names,
                                    seq_len(R.BS)))

  data.test <- cbind(obs = Nobs + seq_len(dim(data.db.test)[1]), data.db.test)
  modelz <- list()
  data.test.use <- NULL

  # Loop over bootstrap draws. r only advances when all methods could be estimated on the draw.
  # NOTE(review): there is no cap on failed draws; a method that fails on every sample loops forever.
  r <- 1
  ndraws <- 0
  while (r <= R.BS) {
    if (r %% 10 == 0) print(r)
    if (do.bootstrap) {
      # construct random bootstrap sample of the data; repeat (not while) so that at least
      # one sample is drawn even when min_cr is 0
      repeat {
        ndraws <- ndraws + 1
        # randomly drawn countries (with replacement)
        if (sample.countries) {
          countries.bs <- countries[sample.int(length(countries), replace = TRUE)]
        } else {
          countries.bs <- countries
        }
        # restrict sample.master to selected countries (exact match, not regex)
        pos.bs1 <- unlist(lapply(countries.bs, function(cn) which(sample.master$Country == cn)))
        sample.bs <- sample.master[pos.bs1, , drop = FALSE]
        # randomly drawn starting times; sample.int avoids sample()'s special case for a
        # length-one numeric vector
        time.bs <- time.vec[sample.int(length(time.vec), replace = TRUE)]
        pos.bs2 <- unlist(lapply(time.bs, function(tt) which(abs(sample.bs$Date - tt) < date.tol)))
        # get blocks from data.train
        drawn <- sample.bs[pos.bs2, , drop = FALSE]
        pos.bs <- unlist(Map(seq, drawn$pos1, drawn$pos2))
        # too few rows drawn: indexing would create NA rows, so draw again
        if (length(pos.bs) < Nobs) next
        data.train <- data.db.train[pos.bs[seq_len(Nobs)], ]
        PriorP <- mean(data.train[["pre"]])
        if (PriorP >= min_cr / Nobs) break
      }
    } else {
      data.train <- data.db.train[seq_len(Nobs), ]
      PriorP <- mean(data.train[["pre"]])
    }
    data.train <- cbind(obs = seq_len(Nobs), data.train)
    realizations <- data.train[["pre"]]
    check <- TRUE

    #### Standardization ####
    if (parameters$standardizeList) {
      sdvars <- unique(unlist(parameters$cols.expl))
      colStats <- data.train[, lapply(.SD, function(x) c(mean(x), sd(x))), .SDcols = sdvars]
      # copy(): a plain assignment only aliases a data.table, so := would overwrite data.test
      # itself and every later draw would standardize already standardized test data.
      data.train.norm <- data.table::copy(data.train)
      data.test.norm <- data.table::copy(data.test)
      for (sdcol in sdvars) {
        sd.center <- colStats[[sdcol]][1]
        sd.scale <- colStats[[sdcol]][2]
        data.train.norm[, (sdcol) := scale(data.train[, sdcol, with = FALSE], sd.center, sd.scale)]
        data.test.norm[, (sdcol) := scale(data.test[, sdcol, with = FALSE], sd.center, sd.scale)]
      }
    } else { # no standardization:
      data.train.norm <- data.train
      data.test.norm <- data.test
    }

    for (method in methodz) {
      # Set hyperparameters by method and set of explanatory variables.
      method.parts <- strsplit(method, ".", fixed = TRUE)[[1]]
      method.calc <- method.parts[1]
      dnum <- as.numeric(method.parts[2])
      usevars <- c("Country", "Date", parameters$cols.expl[[dnum]], "c0", parameters$col.resp, "latepre", "post")
      data.train.use <- as.data.frame(data.train.norm[, usevars, with = FALSE])
      data.test.use <- as.data.frame(data.test.norm[, usevars, with = FALSE])
      cols.string <- columns(parameters$col.resp, parameters$cols.expl[[dnum]])
      if (!parameters$optimizationMode) {
        # Set hyperparameters of method based on paramtable (which comes from an exogenous csv-file
        # specifying hyperparameters for all estimation methods)
        namecols <- grep("_name", colnames(parameters$paramtable))
        valcols <- grep("_val", colnames(parameters$paramtable))
        for (col in seq_along(namecols)) {
          if (!is.na(parameters$paramtable[method, namecols[col]])) {
            parameters[[parameters$paramtable[method, namecols[col]]]] <- parameters$paramtable[method, valcols[col]]
          }
        }
      }
      prob.name <- paste0("Prob(", method, ")")
      ot.name <- paste0("OT(", method, ")")
      # An estimation error is reported and leaves train.out at NULL
      train.out <- tryCatch(
        calc.train.bs(method.calc, cols.string, data.train.use, data.test.use, parameters),
        error = function(e) {
          message("Error in calc.train.bs for method ", method, ": ", conditionMessage(e))
          NULL
        }
      )
      if (!is.null(train.out)) {
        data.out[, prob.name, r] <- train.out$temp.test
        threshold <- calculate.threshold(realizations, train.out$temp.train, parameters$mu,
                                         optimizethreshold = parameters$optimizethreshold,
                                         evaluate = FALSE, PriorP = PriorP)$threshold
        data.out[, ot.name, r] <- threshold
        if (!do.bootstrap) {
          modelz <- c(modelz, list(train.out$model))
        }
      } else {
        if (R.BS == 1) {
          # Single estimation: keep the slot, but mark the probabilities as missing
          data.out[, prob.name, r] <- NA
          data.out[, ot.name, r] <- mean(realizations)
          if (!do.bootstrap) {
            modelz <- c(modelz, list(NULL))
          }
        } else {
          # If there is an estimation error in the bootstrap case, don't use the bootstrap results
          check <- FALSE
          break
        }
      }
    }
    if (check) {
      data.out[, "PriorP", r] <- PriorP
      r <- r + 1
    }
  }

  if (do.bootstrap) {
    print(paste("total number of draws needed:", ndraws))
    return(list(data.out = data.out))
  } else {
    names(modelz) <- c(methodz)
    return(list(data.out = data.out, modelz = modelz, data.test.use = data.test.use))
  }
}