# Source file: R_ladies_20190218.R (146 lines, 125 loc, 4.65 KB)
# Date: 2019-02-18
# Author: Annika Tillander
# Purpose: LAB for R-ladies
# Two-class linear discriminant analysis with a user-supplied precision matrix.
#
# Arguments:
#   x          numeric matrix (or data frame) of training observations,
#              one row per observation and one column per feature.
#   y          class labels, one per row of x; exactly two distinct classes.
#   sigma_inv  p x p inverse covariance (precision) matrix used by the rule.
#   tx         observations to classify, one row per observation; a bare
#              vector of length p is treated as a single observation.
#
# Returns a numeric vector with one entry per row of tx: 0 for the class that
# sorts first in y and 1 for the class that sorts second.
#
# Stops with an error when y does not contain exactly two classes or when the
# dimensions of x, y and tx do not agree.
my_lda <- function(x, y, sigma_inv, tx) {
  x <- as.matrix(x)
  if (is.null(dim(tx))) {
    tx <- matrix(tx, nrow = 1) # a bare vector is one observation
  }
  tx <- as.matrix(tx)
  stopifnot(length(y) == nrow(x), ncol(tx) == ncol(x))

  classes <- sort(unique(y))
  if (length(classes) != 2) {
    stop("my_lda() needs exactly two classes in y, found ", length(classes),
         call. = FALSE)
  }

  # Row indices of each class; which() also discards missing labels
  idx1 <- which(y == classes[1])
  idx2 <- which(y == classes[2])

  # Class proportions as plain numerics, so that no table/array attributes
  # leak into the matrix arithmetic below
  n_k <- c(length(idx1), length(idx2))
  prior <- n_k / sum(n_k)

  # drop = FALSE keeps a one-row class as a matrix, so its mean is still defined
  mu1 <- colMeans(x[idx1, , drop = FALSE])
  mu2 <- colMeans(x[idx2, , drop = FALSE])

  # Quadratic form m' sigma_inv m as a plain scalar
  quad <- function(m) drop(crossprod(m, sigma_inv %*% m))

  # Classify to the second class when the linear score exceeds the threshold
  lim_value <- 0.5 * quad(mu2) - 0.5 * quad(mu1) + log(prior[1]) - log(prior[2])
  test_value <- drop(tx %*% sigma_inv %*% (mu2 - mu1))
  as.numeric(test_value > lim_value)
}
# Load data
# NOTE(review): the path is a redacted placeholder; the rest of the script
# presumes the file provides a feature matrix `x` and a label vector `y` --
# confirm before running.
load(file = "C:/***/sim_dat.Rdata")

# The packages needed. library() stops with an error when a package is
# missing, whereas require() only warns and lets the script fail later on.
# The attach order is kept as it was, so name masking between packages is
# unchanged.
library(ggplot2)
library(pROC)
library(ModelMetrics)
library(caret)
library(DAAG)
library(cvTools)
library(class)
library(randomForest)
library(glasso)
library(nnet)

n <- length(y) # Number of observations
p <- dim(x)[2] # Number of features

# Random sample of observations held out as the final test set
n_test <- 20
stopifnot(n > n_test) # some observations must remain for training
test_urval <- sample(seq_len(n), n_test)

# Test data
test_y <- y[test_urval]
test_x <- x[test_urval, ]
# Check the size
length(test_y)
dim(test_x)

# Train data: everything that was not sampled for testing
train_y <- y[-test_urval]
train_x <- x[-test_urval, ]
# Check the size
length(train_y)
dim(train_x)
# Cross validation
# Number of folds
my_k <- 4

# Subset data into folds
# In caret: a list with the held-out row indices of each fold
flds <- createFolds(train_y, k = my_k, list = TRUE, returnTrain = FALSE)
# In cvTools: the equivalent call, shown as an alternative
folds <- cvFolds(NROW(train_x), K = my_k)

# Parameter space: one candidate value per method at each grid position
lambda <- (3:8) / 10 # Lambda (penalty) for gLASSO
layers <- 2:7        # Number of hidden units for nn (nnet `size`)
nb <- 2:7            # Neighbors for knn
nf <- 3 * (2:7)      # Number of features randomly sampled as candidates at each split

# The grid size follows from the grids instead of being typed in separately;
# all grids must have the same length because they share one loop index.
ps <- length(lambda)
stopifnot(length(layers) == ps, length(nb) == ps, length(nf) == ps)

# Scaling and centering are advised for neural networks
s_train_x <- scale(train_x, center = TRUE, scale = TRUE)
xy <- cbind(train_y, s_train_x)

# Turning outcome into factor for random forest
rf_y <- as.factor(train_y)
rf_xy <- data.frame(rf_y, train_x)

# Collect outcomes: one row per method, one column per fold
temp_auc <- matrix(NaN, 4, my_k)

# For the result: one row per method, one column per grid position
result_auc <- matrix(NaN, 4, ps)
rownames(result_auc) <- c("LDA", "NN", "kNN", "RF")
colnames(result_auc) <- 1:ps
# Grid search: for every grid position, estimate the mean AUC of each method
# over the cross-validation folds.
for (i in seq_len(ps)) { # Loop over parameter space
  for (j in seq_len(my_k)) { # Loop over folds
    held_out <- flds[[j]] # row indices of the current validation fold
    true_y <- train_y[held_out]

    # LDA with a graphical-lasso estimate of the precision matrix
    s <- cov(train_x[-held_out, ])
    is <- glasso(s, rho = lambda[i])$wi
    pred_lda <- my_lda(x = train_x[-held_out, ], y = train_y[-held_out],
                       sigma_inv = is, tx = as.matrix(train_x[held_out, ]))

    # Neural network on the scaled data
    mod_nn <- nnet(train_y ~ ., data = xy[-held_out, ], size = layers[i],
                   trace = FALSE)
    # NOTE(review): 0.1 is an unusual cut-off for a 0/1 outcome -- confirm
    # that it is intended.
    pred_nn <- as.numeric(predict(mod_nn, xy[held_out, ]) > 0.1)

    # k nearest neighbours
    pred_knn <- knn(train = train_x[-held_out, ], test = train_x[held_out, ],
                    cl = train_y[-held_out], k = nb[i])

    # Random forest; factor codes 1/2 are shifted to 0/1
    mod_rf <- randomForest(rf_y ~ ., data = rf_xy[-held_out, ], mtry = nf[i])
    pred_rf <- as.numeric(predict(mod_rf, rf_xy[held_out, ])) - 1

    # auc is taken from ModelMetrics explicitly: pROC exports a function of
    # the same name, so a bare auc() would depend on the attach order.
    temp_auc[, j] <- c(ModelMetrics::auc(true_y, pred_lda),
                       ModelMetrics::auc(true_y, pred_nn),
                       ModelMetrics::auc(true_y, pred_knn),
                       ModelMetrics::auc(true_y, pred_rf))
  }
  # Mean AUC over the folds for each method
  result_auc[, i] <- rowMeans(temp_auc)
}
result_auc
# Get the grid position that generates the best mean AUC for each method.
# which.max() always yields a single index (the first one when several grid
# positions tie), so the look-ups lambda[...], layers[...], nb[...] and
# nf[...] further down receive exactly one value.
lda_max <- unname(which.max(result_auc["LDA", ]))
nn_max <- unname(which.max(result_auc["NN", ]))
knn_max <- unname(which.max(result_auc["kNN", ]))
rf_max <- unname(which.max(result_auc["RF", ]))
# Final test: refit every method on the full training set with its selected
# parameter and predict the held-out test set.

# LDA with the graphical-lasso precision matrix
s <- cov(train_x)
is <- glasso(s, rho = lambda[lda_max])$wi
pred_lda <- my_lda(x = train_x, y = train_y,
                   sigma_inv = is, tx = as.matrix(test_x))

# Neural network
mod_nn <- nnet(train_y ~ ., data = xy, size = layers[nn_max], trace = FALSE)
# The test data must be transformed with the centre and scale estimated on
# the training data; rescaling it with its own mean and sd would feed the
# network inputs on a different scale than the one it was trained on.
c_test_x <- scale(test_x,
                  center = attr(s_train_x, "scaled:center"),
                  scale = attr(s_train_x, "scaled:scale"))
# Give the test columns the names the model was fitted with (same idea as
# for the random forest below), so prediction also works when x has no
# column names.
nn_test_x <- as.data.frame(c_test_x)
names(nn_test_x) <- names(as.data.frame(xy))[-1]
pred_nn <- as.numeric(predict(mod_nn, nn_test_x) > 0.1)

# k nearest neighbours
pred_knn <- knn(train = train_x, test = test_x, cl = train_y, k = nb[knn_max])

# Random forest; the test columns get the names used in the training frame
mod_rf <- randomForest(rf_y ~ ., data = rf_xy, mtry = nf[rf_max])
rf_test_x <- test_x
colnames(rf_test_x) <- colnames(rf_xy)[-1]
pred_rf <- as.numeric(predict(mod_rf, rf_test_x)) - 1
# Calculate ROC on the test set for every method
lda_roc <- roc(test_y, pred_lda)
nn_roc <- roc(test_y, pred_nn)
pred_knn <- as.numeric(pred_knn) - 1 # factor codes 1/2 -> 0/1
knn_roc <- roc(test_y, pred_knn)
rf_roc <- roc(test_y, pred_rf)

# The result: the four ROC curves in a 2 x 2 grid.
# par() returns the previous settings, which are restored afterwards so that
# later plots are not forced into the same grid.
old_par <- par(mfrow = c(2, 2))
plot.roc(lda_roc, main = "LDA")
plot.roc(nn_roc, main = "NN")
plot.roc(knn_roc, main = "kNN")
plot.roc(rf_roc, main = "RF")
par(old_par)