-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy pathC3_LDA.R
122 lines (77 loc) · 2.9 KB
/
C3_LDA.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
library(tidyverse)
library(MASS)
library(ISLR)
library(caret)
set.seed(913)
# Load the Default data set (ISLR) and check how imbalanced the classes are.
dfDefault <- Default
dfDefault %>% dplyr::count(default)

# Histogram of balance, coloured by default status.
p <- ggplot(dfDefault, aes(balance, fill = default)) +
  geom_histogram(binwidth = 500)
p

# Smoothed density view of the same relationship (heavily smoothed).
pl1 <- ggplot(dfDefault, aes(balance, fill = default)) +
  geom_density(alpha = 0.2, adjust = 5)
pl1
# -----------------
# Fit a one-predictor LDA: default ~ balance, on the full data set.
lda.fit <- lda(default ~ balance, data = dfDefault)
lda.fit

# In-sample predictions; tally the predicted classes.
lda.pred <- predict(lda.fit)
dfPred <- data.frame(lda.pred)
dfPred %>% dplyr::count(class)

# Mark the midpoint of the two class means on both plots.
midpoint <- mean(lda.fit$means)
pl1 <- pl1 + geom_vline(xintercept = midpoint)
pl1
p <- p + geom_vline(xintercept = midpoint)
p
# get decision rule (don't worry about doing this - just FYI)
# Closed-form boundary for two-class, one-predictor LDA: the midpoint of the
# class means (A), shifted by the log prior odds (B) scaled by the pooled
# within-class variance over the difference in class means (C).
A <- mean(lda.fit$means)  # was `A <- A <- ...` — redundant double assignment
B <- log(lda.fit$prior[2]) - log(lda.fit$prior[1])  # log prior odds
# Prior-weighted pooled within-class variance of balance.
s2.k <- t(tapply(dfDefault$balance, dfDefault$default, var)) %*% lda.fit$prior
C <- s2.k / (lda.fit$means[1] - lda.fit$means[2])
dr <- A + B * C
dr
# The analytic boundary (red) sits next to the naive midpoint line.
p <- p + geom_vline(xintercept = dr, color = "red")
p
# Confusion matrix at the default 0.5 posterior threshold.
# (fixed: `=` assignment -> `<-`, per the rest of the file)
tst <- confusionMatrix(factor(lda.pred$class), factor(dfDefault$default),
                       positive = "Yes")
tst$table
tst$byClass[1]  # Sensitivity
tst$byClass[2]  # Specificity
tst$byClass[3]  # Pos Pred Value
tst$byClass[4]  # Neg Pred Value
# look at the data again
# Bind predicted class, true class, and posteriors, then attach predictors.
firstAnalysis <- as_tibble(cbind(as.character(lda.pred$class),
                                 as.character(dfDefault$default),
                                 lda.pred$posterior))
firstAnalysis <- cbind(firstAnalysis,
                       dplyr::select(dfDefault, student, balance, income))
#write_csv(firstAnalysis, "firstAnalysis.csv")
# let's adjust the threshold
# Lower the "Yes" cutoff to posterior >= 0.2: trades specificity for
# sensitivity on the rare positive class.
pred <- rep("No", nrow(dfDefault))
pred[lda.pred$posterior[, 2] >= 0.2] <- "Yes"
dfPred <- data.frame(pred)
dfPred %>% dplyr::count(pred)
confusionMatrix(factor(pred), factor(dfDefault$default), positive = "Yes")
# now splitting into validation sets
# 60/40 train/test split, keyed on a synthetic row id so anti_join can
# recover the held-out rows exactly.
dfDefault <- dfDefault %>% rownames_to_column("SampleID")
xTrain <- sample_n(dfDefault, round(nrow(dfDefault) * .6))
xTest <- dfDefault %>% anti_join(xTrain, by = "SampleID")

# Refit on the training set only; score the held-out rows.
lda.fit <- lda(default ~ balance, xTrain)
lda.pred <- predict(lda.fit, xTest)

# get decision rule (same algebra as the full-data version above)
A <- mean(lda.fit$means)  # was `A <- A <- ...` — redundant double assignment
B <- log(lda.fit$prior[2]) - log(lda.fit$prior[1])
# Pooled variance from the TRAINING data: the model (means, priors) was fit
# on xTrain, so the rule's variance term should come from xTrain too.
# (The original used xTest here, mixing held-out data into the rule.)
s2.k <- t(tapply(xTrain$balance, xTrain$default, var)) %*% lda.fit$prior
C <- s2.k / (lda.fit$means[1] - lda.fit$means[2])
dr <- A + B * C
dr
p <- p + geom_vline(xintercept = dr, color = "red")
p
# same place — the training-only boundary lands (nearly) on the full-data one
confusionMatrix(lda.pred$class, factor(xTest$default), positive = "Yes")
# Extend the model to three predictors. Remember: with p > 2 predictors the
# decision boundary can no longer be visualized directly.
lda.fit <- lda(default ~ student + balance + income, xTrain)
lda.fit
lda.pred <- predict(lda.fit, xTest)
confusionMatrix(lda.pred$class, factor(xTest$default), positive = "Yes")

# Recover the original rows alongside predictions and inspect them:
#finalAnalysis <- as_tibble(cbind(as.character(lda.pred$class), as.character(xTest$default), lda.pred$posterior))
#finalAnalysis <- cbind(finalAnalysis,dplyr::select(xTest, student, balance, income))
#write_csv(finalAnalysis, "finalAnalysis.csv")