-
Notifications
You must be signed in to change notification settings - Fork 4
/
regression_estimators.R
154 lines (125 loc) · 6.5 KB
/
regression_estimators.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
#' Load Libraries
packages <- c("broom","here","tidyverse","skimr","rlang","sandwich","boot","xtable")
for (package in packages) {
if (!require(package, character.only=T, quietly=T)) {
install.packages(package,repos='http://lib.stat.cmu.edu/R/CRAN')
}
}
for (package in packages) {
library(package, character.only=T)
}
#' Define Plot Format
thm <- theme_classic() +
theme(
legend.position = "top",
legend.title=element_blank(),
legend.background = element_rect(fill = "transparent", colour = NA),
legend.key = element_rect(fill = "transparent", colour = NA)
)
theme_set(thm)
#' Define the expit (inverse logit) function
expit <- function(x){1/(1+exp(-x))}
#' Import, look at, and format data
#' This code downloads the data from the website for the book by Hernan and
#' Robins "Causal Inference." It requires internet access, and will not work if
#' the link changes.
file_loc <- url("https://cdn1.sph.harvard.edu/wp-content/uploads/sites/1268/1268/20/nhefs.csv")
#' This begins the process of cleaning and formatting the data
nhefs <- read_csv(file_loc) %>%
select(qsmk,wt82_71,exercise,sex,age,race,income,marital,school,asthma,bronch) %>%
mutate(income=as.numeric(income>15),
marital=as.numeric(marital>2)) %>%
na.omit()
factor_names <- c("exercise","income","marital","sex","race","asthma","bronch")
nhefs[,factor_names] <- lapply(nhefs[,factor_names] , factor)
#' Define outcome
nhefs <- nhefs %>% mutate(wt_delta = as.numeric(wt82_71>median(wt82_71)))
nhefs
#' Quick summary of data
skim(nhefs)
#' Here, we start fitting relevant regression models to the data.
#' modelForm is a regression argument that one can use to regress the
#' outcome (wt_delta) against the exposure (qsmk) and selected confounders.
formulaVars <- paste(names(nhefs)[-c(2,12)],collapse = "+")
modelForm <- as.formula(paste0("wt_delta ~", formulaVars))
modelForm
#' This model can be used to quantify a conditionally adjusted odds ratio with correct standard error
modelOR <- glm(modelForm,data=nhefs,family = binomial("logit"),control = list(trace = TRUE))
tidy(modelOR)[2,]
#' This model can be used to quantify a conditionally adjusted risk ratio with with correct standard error
#' However, error it returns an error and thus does not provide any results.
modelRR_binom <- glm(modelForm,data=nhefs,family = binomial("log"))
#' This model can be used to quantify a conditionally adjusted risk difference with correct standard error
modelRD_binom <- glm(modelForm,data=nhefs,family = binomial("identity"))
#' This model can be used to quantify a conditionally risk ratio using the Poisson distribuiton and log link function.
#' However, because the Poisson distribution is used, the model provides incorrect standard error estimates.
modelRR <- glm(modelForm,data=nhefs,family = poisson("log"))
tidy(modelRR)[2,]
#' To obtain the correct variance, we use the "sandwich" function to obtain correct sandwich (robust) standard error estimates.
sqrt(sandwich(modelRR)[2,2])
#' This model can be used to obtain a risk difference with the gaussian distribiton or using ordinary least
#' squares (OLS, via the lm function). Again, the model based standard error estimates are incorrect.
modelRD <- glm(modelForm,data=nhefs,family = gaussian("identity"))
modelRD <- lm(modelForm,data=nhefs)
tidy(modelRD)[2,]
#' To obtain the correct variance, we use the "sandwich" function to obtain correct sandwich (robust) standard error estimates.
sqrt(sandwich(modelRD)[2,2])
#' This model shows some potential evidence for interaction between smoking and exercise on the risk difference scale.
formulaVars <- paste(names(nhefs)[-c(2,12)],collapse = "+")
modelForm <- as.formula(paste0("wt_delta ~", formulaVars,"+qsmk*exercise"))
modelForm
summary(lm(modelForm,data=nhefs))
#' Marginal Standardization
##' To avoid assuming no interaction between smoking and any of the other variables
##' in the model, we subset modeling among exposed/unexposed. This code removes smoking from the model,
##' which will allow us to regress the outcome against the confounders among the exposed and
##' the unexposed searately. Doing so will allow us to account for any potential exposure-covariate interactions
##' that may be present.
formulaVars <- paste(names(nhefs)[-c(1,2,12)],collapse = "+")
modelForm <- as.formula(paste0("wt_delta ~", formulaVars))
modelForm
#' Regress the outcome against the confounders among the unexposed (model0) and then among the exposed (model1)
model0 <- glm(modelForm,data=subset(nhefs,qsmk==0),family=binomial("logit"))
model1 <- glm(modelForm,data=subset(nhefs,qsmk==1),family=binomial("logit"))
##' Generate predictions for everyone in the sample using the model fit to only the
##' unexposed (mu0 predictions) and only the exposed (mu1 predictions).
mu1 <- predict(model1,newdata=nhefs,type="response")
mu0 <- predict(model0,newdata=nhefs,type="response")
#' Marginally adjusted risk ratio
marg_stand_RR <- mean(mu1)/mean(mu0)
#' Marginally adjusted risk difference
marg_stand_RD <- mean(mu1)-mean(mu0)
#' Using the bootstrap to obtain confidence intervals for the marginally adjusted
#' risk ratio and risk difference.
bootfunc <- function(data,index){
boot_dat <- data[index,]
model0 <- glm(modelForm,data=subset(boot_dat,qsmk==0),family=binomial("logit"))
model1 <- glm(modelForm,data=subset(boot_dat,qsmk==1),family=binomial("logit"))
mu1 <- predict(model1,newdata=boot_dat,type="response")
mu0 <- predict(model0,newdata=boot_dat,type="response")
marg_stand_RR <- mean(mu1)/mean(mu0)
marg_stand_RD <- mean(mu1)-mean(mu0)
res <- c(marg_stand_RD,marg_stand_RR)
return(res)
}
#' Run the boot function. Set a seed to obtain reproducibility
set.seed(123)
boot_res <- boot(nhefs,bootfunc,R=2000)
boot_RD <- boot.ci(boot_res,index=1)
boot_RR <- boot.ci(boot_res,index=2)
#' Creating Table 1 presented in the manuscript
t11<-as.matrix(tidy(modelRD)[2,2])
t12<-as.matrix(tidy(modelRD)[2,2] - 1.96*sqrt(sandwich(modelRD)[2,2]))
t13<-as.matrix(tidy(modelRD)[2,2] + 1.96*sqrt(sandwich(modelRD)[2,2]))
t21<-as.matrix(exp(tidy(modelRR)[2,2]))
t22<-as.matrix(exp(tidy(modelRR)[2,2] - 1.96*sqrt(sandwich(modelRR)[2,2])))
t23<-as.matrix(exp(tidy(modelRR)[2,2] + 1.96*sqrt(sandwich(modelRR)[2,2])))
t31<-as.matrix(marg_stand_RD)
t32<-as.matrix(boot_RD$bca[4])
t33<-as.matrix(boot_RD$bca[5])
t41<-as.matrix(marg_stand_RR)
t42<-as.matrix(boot_RR$bca[4])
t43<-as.matrix(boot_RR$bca[5])
tab <- data.frame(method=c("GLM","Marginal Standardization"),rd=c(t11,t31),rd.ll=c(t12,t32),rd.ul=c(t13,t33),
rr=c(t21,t41),rr.ll=c(t22,t42),rr.ul=c(t23,t43))
xtable(tab)