-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy pathC1_LogReg_Intro.R
77 lines (51 loc) · 2.3 KB
/
C1_LogReg_Intro.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
library(tidyverse)
library(ISLR)
library(caret)
# from book
set.seed(0907)
dfDefault <- Default
p <- ggplot(dfDefault, aes(balance, fill = default)) +
geom_histogram(binwidth = 500) +
facet_wrap(~student) +
theme(panel.background = element_rect(fill = "white"))
p
pl1 <- ggplot(dfDefault, aes(balance, fill = default)) +
geom_density(alpha = 0.2, adjust = 5 ) +
theme(panel.background = element_rect(fill = "white"))
pl1
# when you get into equations and many modeling algorithms, you'll find that
# variable values need to be integers (0 and 1 only), and factors will often convert to 1, 2:
unique(as.numeric(dfDefault[,"default"]))
dfDefault$default <- as.integer(dfDefault$default)-1
dfDefault <- dfDefault %>% rownames_to_column("SampleID")
train <- sample_n(dfDefault, round(nrow(dfDefault)*.6,0))
test <- dfDefault %>% anti_join(train, by = "SampleID")
# showing equation with one categorical varible
glm.fit <- glm(default ~ student, data = dfDefault, family = binomial)
summary(glm.fit)
dfDefault$Prob <- predict(glm.fit, type = "response")
# you must use "response" type to get probabilities
ggplot(dfDefault, aes(x=balance, y=Prob)) + geom_point()
# now one continuous variable
glm.fit <- glm(default ~ balance, data = train, family = binomial)
summary(glm.fit)
test$Prob <- predict(glm.fit, type = "response", newdata = test)
ggplot(test, aes(x=balance, y=Prob)) + geom_point()
# ---------------------- multiple logistic regression ---------------- #
glMod2 <- glm(default ~ student + balance + income, data = train, family = binomial)
summary(glMod2)
test$mProb2 <- predict(glMod2, type = "response", newdata = test)
mTest = model.matrix(default ~ student + balance + income, data = test)
bet1 <- as.numeric(glMod2$coefficients)
test$tmProb2 <- exp( t(bet1%*%t(mTest)))/(1+exp(t(bet1%*%t(mTest))))
ggplot(test, aes(x=balance, y=tmProb2, color = factor(student))) +
geom_point() +
theme(panel.background = element_rect(fill = "white"))
# let prove the eqquation out:
sum(round(test$tProb - test$mProb,0))
# how did we do?
test$class = factor(if_else(test$tmProb < .5, "No", "Yes"))
test$D2 = factor(if_else(test$default < .5, "No", "Yes"))
table(test$class, test$D2)
confusionMatrix((test$class), factor(test$D2), positive = "Yes")
test %>% group_by(default) %>% summarise(Cnt = n())