-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy pathRegularization Revisited.R
64 lines (41 loc) · 1.82 KB
/
Regularization Revisited.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
library(tidyverse)
#' Root-mean-square error
#'
#' @param error Numeric vector or matrix of residuals (prediction - observed).
#' @return A single number: the square root of the mean of the squared
#'   residuals. `NA` values in `error` propagate to the result.
rmse <- function(error) {
  squared_error <- error^2
  sqrt(mean(squared_error))
}
# Get some data: keep only the response (hwy) and the predictors used below.
dfMPG <- select(mpg, manufacturer, hwy, displ)

# Build the design matrix for a model that is quadratic in displ and also
# includes manufacturer. The first column is the intercept.
ModMatrix <- model.matrix(
  hwy ~ manufacturer + displ + I(displ^2),
  data = dfMPG
)

# Pull the y values (the response vector).
vY <- dfMPG$hwy

# Solve the normal equations (X'X) b = X'y for the betas.
# solve(A, b) solves the linear system directly instead of forming the
# explicit inverse of X'X, and crossprod() computes X'X and X'y without a
# separate transpose step.
mBeta <- as.numeric(solve(crossprod(ModMatrix), crossprod(ModMatrix, vY)))

# Error estimated from training: RMSE of the fitted values X b against the
# observed hwy.
rmse(ModMatrix %*% mBeta - dfMPG$hwy)
# Add regularization.
# You don't need to set up a loop and a vector of penalty values; you can just
# pick any number you want to start with (10 in this example, chosen
# arbitrarily). Trying one value at a time is sometimes easier than looping,
# because you can see which way to go (is the error getting better or worse?).
#
# You still need the diagonal matrix d so that the regularization factor can be
# added to X'X, which is why its dimensions come from the model matrix. The
# first diagonal entry is set to zero so that the intercept is not penalized.
n <- ncol(ModMatrix)
d <- diag(1, n, n)
d[1, 1] <- 0

# Solve (X'X + 10 d) b = X'y directly rather than forming the inverse.
vBetaReg1 <- as.numeric(
  solve(crossprod(ModMatrix) + 10 * d, crossprod(ModMatrix, vY))
)

# Error estimated from training for the regularized fit.
rmse(ModMatrix %*% vBetaReg1 - dfMPG$hwy)

# Here, the training error of the regularized model is greater than that of
# the unregularized fit. That is what you want: the penalty makes the model
# less variable, with the aim that it performs better when new data is
# acquired.
# Generate a random new dataset: the mpg columns are kept, but hwy is replaced
# by a linear function of displ plus normally distributed noise (sd = 3).
# NOTE: no seed is set in this script, so the errors below change from run to
# run; call set.seed() beforehand if reproducible numbers are needed.
dfMPG2 <- mpg %>%
  mutate(hwy = 35 + (-3 * displ) + rnorm(nrow(mpg), 0, 3))

# Now create a new model matrix from the new data, using the same formula as
# the training model so the columns line up with the fitted coefficients.
ModMatrix2 <- model.matrix(
  hwy ~ manufacturer + displ + I(displ^2),
  data = dfMPG2
)

# Polynomial (unregularized) model error on the new data.
rmse(ModMatrix2 %*% mBeta - dfMPG2$hwy)

# Regularized model error on the new data.
rmse(ModMatrix2 %*% vBetaReg1 - dfMPG2$hwy)