forked from ainaimi/SuperLearnerIntro
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSL_Intro_EventPrediction_cvAUC.R
183 lines (157 loc) · 7.54 KB
/
SL_Intro_EventPrediction_cvAUC.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
# R code to demonstrate implementation of Super Learner
# for binary classification (Example 2) in
# "Stacked Generalization: An Introduction to Super Learning"
# by Ashley I. Naimi and Laura B. Balzer
# load the relevant packages
library(SuperLearner);library(data.table);library(nnls);library(mvtnorm)
library(ranger);library(xgboost);library(splines);library(Matrix)
library(ggplot2);library(xtable);library(pROC)
library(here)
# EXAMPLE 2: simulate five correlated covariates and a binary outcome
set.seed(123)
n <- 10000
# Draw a random 5 x 5 matrix, make it symmetric, and replace it by the nearest
# positive-definite matrix so it can serve as the covariance for rmvnorm()
sigma <- matrix(runif(25, min = 0, max = 1), ncol = 5)
sigma <- forceSymmetric(abs(sigma))
sigma <- as.matrix(nearPD(sigma)$mat)
x <- rmvnorm(n, mean = c(0, 0.25, 0.15, 0, 0.1), sigma = sigma)
# Design matrix: intercept, main effects, and all two- and three-way
# interactions of the five covariates
modelMat <- model.matrix(~ (x[, 1] + x[, 2] + x[, 3] + x[, 4] + x[, 5])^3)
# Intercept is fixed at 2; every other coefficient is drawn from Uniform(0, 1)
beta <- c(2, runif(ncol(modelMat) - 1, min = 0, max = 1))
# true underlying risk of the outcome
mu <- 1 - plogis(modelMat %*% beta)
y <- rbinom(n, size = 1, prob = mu)
# Quick look at the distribution of the true risk and the outcome prevalence
hist(mu)
mean(y)
# D holds the five covariates (columns 1-5) followed by the outcome (column 6)
x <- data.frame(x)
D <- data.frame(x, y)
# Specify the number of folds for V-fold cross-validation
folds <- 5
## Assign every row of D to one of the 5 folds for 5-fold cross-validation:
## row i goes to fold ((i - 1) %% folds) + 1, which is the same assignment
## that split(D, 1:folds) produces in the hand-coded SL further down.
## The index is built over all nrow(D) rows (not a hard-coded 1:1000) and is
## handed to SuperLearner() through cvControl$validRows, so that the exact
## same folds are used in both the SL fit with the R package and the
## hand-coded SL.
index <- split(seq_len(nrow(D)), rep_len(seq_len(folds), nrow(D)))
splt <- lapply(seq_len(folds), function(ind) D[index[[ind]], ])
# view the first 6 observations in the first [[1]] and second [[2]] folds
head(splt[[1]])
head(splt[[2]])
#-------------------------------------------------------------------------------
# Fit using the SuperLearner Package
#-------------------------------------------------------------------------------
# Specify the outcome-for-prediction (y), the predictors (x),
# family (for a binary outcome), measure of performance (1-AUC),
# the library (sl.lib), the number of folds, and the validation rows of
# each fold (validRows must be a list of length V)
sl.lib <- c("SL.bayesglm", "SL.polymars")
fitY <- SuperLearner(
  Y = y, X = x, family = "binomial",
  method = "method.AUC",
  SL.library = sl.lib,
  cvControl = list(V = folds, validRows = index)
)
# Note: with a rare binary outcome the stratifyCV option can be used to keep
# roughly the same number of events in every fold
# Print the fit: the 'Risk' column holds the CV estimates of (1-AUC) and the
# 'Coef' column holds the meta-learner weights of the final SuperLearner
fitY
# Predicted probability of the outcome from the SuperLearner ensemble
y_pred <- predict(fitY, onlySL = TRUE)$pred
p <- data.frame(y = y, y_pred = y_pred)
head(p)
# roc() returns the performance measures for binary classification
a <- roc(p$y, p$y_pred, direction = "auto")
# The ROC curve is drawn from the sensitivities and specificities
C <- data.frame(sens = a$sensitivities, spec = a$specificities)
ggplot() +
  geom_step(
    data = C, mapping = aes(x = 1 - spec, y = sens),
    color = "blue", size = .25
  ) +
  theme_light() +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  labs(x = "1 - Specificity", y = "Sensitivity") +
  # gray diagonal: reference line with intercept 0 and slope 1
  geom_abline(intercept = 0, slope = 1, col = "gray")
#-------------------------------------------------------------------------------
# Hand-coding Super Learner
#-------------------------------------------------------------------------------
## 1: split the data into 5 groups for 5-fold cross-validation
head(D, 10)
splt <- split(D, seq_len(folds))
# first 6 observations of the first fold
head(splt[[1]])
#----------------------------
## 2: rotate through the folds; for fold ii:
#    (a) hold fold ii out as the validation set;
#    (b) fit each algorithm on the remaining folds (the training set);
#    (c) predict the outcome for the observations in the validation set;
#    (d) estimate the risk (1-AUC) in the validation set
#
# NOTE(review): bayesglm(), polyclass() and ppolyclass() are not exported by
# any package attached with library() at the top of this script (presumably
# they come from arm and polspline) - confirm they are on the search path.
#
## 2a: training set for fold ii = every fold except the ii-th, stacked
train_sets <- lapply(seq_len(folds), function(ii) do.call(rbind, splt[-ii]))
## 2b: fit each algorithm on the training set (column 6 is y, 1:5 the covariates)
m1 <- lapply(train_sets, function(train) {
  bayesglm(formula = y ~ ., data = train, family = "binomial")
})
m2 <- lapply(train_sets, function(train) {
  polyclass(train[, 6], train[, 1:5], cv = 5)
})
## 2c: predicted probability of the outcome for the ii-th validation set
p1 <- lapply(seq_len(folds), function(ii) {
  valid <- rbindlist(splt[ii])
  predict(m1[[ii]], newdata = valid, type = "response")
})
p2 <- lapply(seq_len(folds), function(ii) {
  valid <- rbindlist(splt[ii])
  # second column of the ppolyclass() output is kept as the predicted risk
  ppolyclass(fit = m2[[ii]], cov = valid[, 1:5])[, 2]
})
# Replace each element of 'splt' by a three-column matrix:
#   column 1: the observed outcome (y) in that validation fold
#   column 2: the CV-predicted probability of the outcome from bayesglm
#   column 3: the CV-predicted probability of the outcome from polyclass
for (i in seq_len(folds)) {
  splt[[i]] <- cbind(splt[[i]][, 6], p1[[i]], p2[[i]])
}
## 2d: CV risk of each algorithm in each validation fold
# the loss is the rank loss, so the risk is (1-AUC); AUC() takes the predicted
# outcomes as 'predictions' and the true outcomes as 'labels'
fold_risk <- function(pred_col) {
  lapply(seq_len(folds), function(ii) {
    fold <- splt[[ii]]
    1 - AUC(predictions = fold[, pred_col], labels = fold[, 1])
  })
}
risk1 <- fold_risk(2) # CV-risk for bayesglm
risk2 <- fold_risk(3) # CV-risk for polyclass
#----------------------------
## 3: average the 5 fold-specific risks into one performance measure per algorithm
mean_risk1 <- mean(do.call(rbind, risk1), na.rm = TRUE)
mean_risk2 <- mean(do.call(rbind, risk2), na.rm = TRUE)
a <- rbind(
  cbind("bayesglm", mean_risk1, deparse.level = 0),
  cbind("polymars", mean_risk2, deparse.level = 0)
)
# the CV-risk estimates can be exported as a table with
# xtable(a)
# compare with the package output
fitY
a
#----------------------------
## 4: estimate the SL weights with optim(), minimising (1-AUC)
# Stack the per-fold matrices into one data frame; row.names = NULL gives
# automatic row names instead of the ones carried by the stacked matrices
X <- setNames(
  data.frame(do.call(rbind, splt), row.names = NULL),
  c("y", "bayesglm", "poly")
)
head(X)
# lower and upper bound for each weight
bounds <- c(0, Inf)
# Objective function for the Super Learner weights: CV risk (1 - AUC) of a
# weighted combination of candidate-algorithm predictions.
#   A   : matrix or data frame with one column of predictions per algorithm
#   y   : observed binary outcome, one value per row of A
#   par : numeric weight vector, one weight per column of A
# Returns the scalar 1 - AUC of the combined predictions A %*% par.
# Works for any number of candidate algorithms (no hard-coded learner names),
# and the value is returned explicitly rather than invisibly via assignment.
SL.r <- function(A, y, par) {
  A <- as.matrix(A)
  # fail early with a clear message if weights and columns do not line up
  stopifnot(length(par) == ncol(A))
  predictions <- A %*% par
  1 - AUC(predictions = predictions, labels = y)
}
# Start from equal weights and minimise SL.r over the box [bounds[1], bounds[2]]
init <- rep(1 / 2, 2)
fit <- optim(
  par = init, fn = SL.r,
  A = X[, 2:3], y = X[, 1],
  method = "L-BFGS-B",
  lower = bounds[1], upper = bounds[2]
)
fit
# Rescale the optimised weights so that they sum to one
alpha <- fit$par / sum(fit$par)
# compare the package output, the hand-coded CV risks and the hand-coded weights
fitY
a
alpha
#---------------------
## 5a: refit every algorithm on the complete data set
# (column 6 of D is the outcome, columns 1:5 are the covariates)
m1 <- bayesglm(formula = y ~ ., data = D, family = "binomial")
m2 <- polyclass(D[, 6], D[, 1:5], cv = 5)
## 5b: predicted probabilities from each fit, using all data
p1 <- predict(m1, newdata = D, type = "response") # bayesglm
p2 <- ppolyclass(fit = m2, cov = D[, 1:5])[, 2] # polyclass
predictions <- cbind(p1, p2)
head(predictions)
## 5c: weighted combination of the two predictions, using the normalised
## optim() weights (alpha) as coefficients
y_pred <- predictions %*% alpha
p <- data.frame(y = y, y_pred = y_pred)
## #--------------------------------------------
# Small helper: pull the sensitivities and specificities out of a roc() object
roc_points <- function(roc_obj) {
  data.frame(sens = roc_obj$sensitivities, spec = roc_obj$specificities)
}
# ROC curve of the hand-coded Super Learner, to check that it gives results
# similar to the SL package
a <- roc(p$y, p$y_pred, direction = "auto")
C2 <- roc_points(a)
head(C2)
###--------------------------------------------
# ROC curves of the individual candidate algorithms fitted to all data
a <- roc(y, p1, direction = "auto")
Cbayes <- roc_points(a)
a <- roc(y, p2, direction = "auto")
Cpoly <- roc_points(a)
# Overlay the four ROC curves in one figure.
# The legend labels are mapped to colours through 'cols'.
cols <- c(
  "SuperLearner Package" = "red",
  "Manual SuperLearner" = "blue",
  "Bayes GLM" = "green",
  "PolyMARS" = "black"
)
ggplot() +
  # C was computed from the predictions of the SuperLearner package fit (fitY),
  # C2 from the hand-coded Super Learner; label each curve accordingly
  geom_step(data = C, aes(1 - spec, sens, color = "SuperLearner Package"), size = .75) +
  geom_step(data = C2, aes(1 - spec, sens, color = "Manual SuperLearner"), linetype = 2, size = .5) +
  geom_step(data = Cbayes, aes(1 - spec, sens, color = "Bayes GLM"), linetype = 2, size = .5) +
  geom_step(data = Cpoly, aes(1 - spec, sens, color = "PolyMARS"), linetype = 2, size = .5) +
  theme_light() + theme(legend.position = c(.8, .2)) +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  labs(x = "1 - Specificity", y = "Sensitivity") +
  # gray diagonal: reference line with intercept 0 and slope 1
  geom_abline(intercept = 0, slope = 1, col = "gray") +
  scale_colour_manual(name = "", values = cols)