# Regularization.R
library(tidyverse)
mydata <- read.csv(file = "C:/Users/ellen/Documents/UH/Fall 2020/Data/Ex1LS.csv", header = TRUE, sep = ",")
model <- lm(Y ~ X, data = mydata)            # linear fit
modelQ <- lm(Y ~ X + I(X^2), data = mydata)  # quadratic fit
modData <- mydata
modData$newYQ <- predict(modelQ, mydata) # quadratic predictions
modData$newY <- predict(model, mydata)   # linear predictions
p <- ggplot(modData, aes(x = X, y = Y)) + geom_point()
p <- p + geom_point(data = modData, aes(x = X, y = newY), color = 'red')
p <- p + geom_smooth(data = modData, aes(X, newY), se = FALSE, color = "red", span = 1.5)
p <- p + geom_smooth(data = modData, aes(X, newYQ), se = FALSE, color = "blue", span = 1.5)
p
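# (optional sketch) a quick numeric comparison of the two fits above;
# summary() and anova() are base-R calls - the F-test asks whether the
# quadratic term buys a significantly better fit
summary(model)
summary(modelQ)
anova(model, modelQ)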
# build the quadratic design matrix by hand and solve via the normal
# equations (go back to Regression 1)
m <- length(mydata$X)
x <- matrix(c(rep(1, m), mydata$X, mydata$X^2), ncol = 3) # columns: 1, X, X^2
n <- ncol(x)
y <- matrix(mydata$Y, ncol = 1)
# set up 3 different regularization term values: 0, 1, and 10
# you don't have to do it this way - you can simplify and just try values one at a time
lambda <- c(0, 1, 10)
d <- diag(1, n, n) # must be n x n - one entry per coefficient
d[1, 1] <- 0       # don't regularize the intercept - it just absorbs the baseline level
th <- array(0, c(n, length(lambda)))
# this is a loop I set up to run through the lambda series for plotting
# - you won't need to do this, but it's here if you like
for (i in 1:length(lambda)) {
  th[, i] <- solve(t(x) %*% x + (lambda[i] * d)) %*% (t(x) %*% y)
  print(i)
}
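# (sketch) the closed-form ridge solve above, wrapped as a small helper for
# reuse - not part of the original walkthrough, just a convenience
ridge_beta <- function(X, Y, lambda) {
  D <- diag(1, ncol(X), ncol(X))
  D[1, 1] <- 0 # leave the intercept unpenalized, as above
  solve(t(X) %*% X + lambda * D) %*% (t(X) %*% Y)
}
ridge_beta(x, y, 10) # should reproduce th[, 3]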
# let's break this down
# recall the normal equations:
betaHat <- solve(t(x) %*% x) %*% t(x) %*% y
# compare to lm
modelQ
as.numeric(betaHat)
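# (optional, hedged) MASS::lm.ridge gives a packaged ridge fit for cross-checking;
# note it standardizes the predictors internally, so its estimates won't match
# the raw normal-equation solution exactly - treat it as a sanity check only
MASS::lm.ridge(Y ~ X + I(X^2), data = mydata, lambda = 1)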
# with lambda = 0 the penalized solution reduces to the same betaHat:
betaHat <- solve(t(x) %*% x + (lambda[1] * d)) %*% (t(x) %*% y)
# no change, which is what we would expect
# now for reg term = 1
betaHat <- solve(t(x) %*% x + (lambda[2] * d)) %*% (t(x) %*% y)
# recall that we're solving for betaHat - the vector of coefficients -
# and we want them to have less effect: adding lambda to the diagonal of
# t(x) %*% x inflates the matrix being inverted, which shrinks the estimates
# adding 1 to the diagonal gives
solve(t(x) %*% x + (1*d)) %*% (t(x) %*% y)
# or add 5 (which adds more penalty)
solve(t(x) %*% x + (5*d)) %*% (t(x) %*% y)
# or 10
solve(t(x) %*% x + (10 * d)) %*% (t(x) %*% y)
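# (sketch) the same solve over a grid of penalties via sapply - each column
# below is the coefficient vector for one lambda, so you can watch the shrinkage
lambdaGrid <- c(0, 1, 5, 10, 50) # 50 is an extra value added here for illustration
sapply(lambdaGrid, function(l) solve(t(x) %*% x + l * d) %*% (t(x) %*% y))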
# the diagonal matrix must match x, which has 3 columns (one per coefficient)
# also remember that lambda * d is scalar (element-wise) multiplication, not a matrix product
# ------------ create seq for smooth visual --------------- #
# generate x sequence (just for visual)
nwx <- seq(1, 4, length.out = 50)
# extend x to include the poly term
x <- matrix(c(rep(1, length(nwx)), nwx, nwx^2), ncol = 3)
# multiply by 3 different beta vectors
newData <- as.data.frame(nwx)
newData$th1 <- x %*% th[, 1] # lambda = 0
newData$th2 <- x %*% th[, 2] # lambda = 1
newData$th3 <- x %*% th[, 3] # lambda = 10
# notice how the fitted curves flatten as the coefficients shrink in magnitude
p <- ggplot(mydata, aes(x = X, y = Y)) + geom_point() + geom_smooth(method = 'lm', se = FALSE, color = 'black')
p <- p + geom_smooth(data = newData, aes(x = nwx, y = th1), color = 'blue')
p <- p + geom_smooth(data = newData, aes(x = nwx, y = th2), color = 'red')
p <- p + geom_smooth(data = newData, aes(x = nwx, y = th3), color = 'green')
p
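# (design note) th1/th2/th3 are already fitted values, so geom_line() draws
# them exactly, while geom_smooth() re-smooths them; a leaner alternative
# (p2 is a new name so the original p above is left intact):
p2 <- ggplot(mydata, aes(x = X, y = Y)) + geom_point() +
  geom_line(data = newData, aes(x = nwx, y = th1), color = 'blue') +
  geom_line(data = newData, aes(x = nwx, y = th2), color = 'red') +
  geom_line(data = newData, aes(x = nwx, y = th3), color = 'green')
p2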
# ----------- end of visual sequence ----------------- #
# now let's check the errors. Refresh the design matrix:
x <- matrix(c(rep(1, nrow(mydata)), mydata$X, mydata$X^2), ncol = 3)
# OR use model.matrix (you'll want to use this where possible) - note that
# this x is the linear design; xq is the quadratic one built by hand above
x <- model.matrix(Y ~ X, mydata)
xq <- model.matrix(Y ~ X + I(X^2), mydata)
# recompute unpenalized betas
betaHat <- solve(t(x) %*% x) %*% t(x) %*% y
betaHatQ <- solve(t(xq) %*% xq) %*% t(xq) %*% y
# and compute errors (the divisor nrow(betaHat) is the same for every fit,
# so it doesn't affect the comparison)
# --- blue (unpenalized quadratic)
sqrt(sum(((xq %*% betaHatQ) - mydata$Y)^2) / nrow(betaHat))
# --- red (lambda = 1)
sqrt(sum(((xq %*% th[, 2]) - mydata$Y)^2) / nrow(betaHat))
# --- green (lambda = 10)
sqrt(sum(((xq %*% th[, 3]) - mydata$Y)^2) / nrow(betaHat))
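# (sketch) the repeated formula above factored into a helper; the divisor
# follows the original code (nrow(betaHat)) for consistency
fitError <- function(design, beta, actual) {
  sqrt(sum((design %*% beta - actual)^2) / nrow(betaHat))
}
fitError(xq, betaHatQ, mydata$Y) # blue
fitError(xq, th[, 2], mydata$Y)  # red
fitError(xq, th[, 3], mydata$Y)  # green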
# so blue has the lowest training error
# now let's create some new data:
newData <- data.frame(X = c(1, 2, 3, 4), Y = c(5, 6, 6.5, 9))
p <- p + geom_point(data = newData, aes(X, Y), color = "red", size = 3)
p
# calculate error on the new points
xq2 <- model.matrix(Y ~ X + I(X^2), newData)
# --- blue (unpenalized quadratic)
sqrt(sum(((xq2 %*% betaHatQ) - newData$Y)^2) / nrow(betaHat))
# --- red (lambda = 1)
sqrt(sum(((xq2 %*% th[, 2]) - newData$Y)^2) / nrow(betaHat))
# --- green (lambda = 10)
sqrt(sum(((xq2 %*% th[, 3]) - newData$Y)^2) / nrow(betaHat))
# and now the lowest error is green. WHY?
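# (sketch) one way to see it: tabulate training vs new-data error side by side
# using the fitError helper from above - the penalized fits give up some
# training accuracy but generalize better to the new points
data.frame(
  fit = c("blue (lambda = 0)", "red (lambda = 1)", "green (lambda = 10)"),
  train = c(fitError(xq, betaHatQ, mydata$Y),
            fitError(xq, th[, 2], mydata$Y),
            fitError(xq, th[, 3], mydata$Y)),
  new = c(fitError(xq2, betaHatQ, newData$Y),
          fitError(xq2, th[, 2], newData$Y),
          fitError(xq2, th[, 3], newData$Y))
)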