-
Notifications
You must be signed in to change notification settings - Fork 0
/
Workfile.Rmd
416 lines (390 loc) · 18.6 KB
/
Workfile.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
---
title: "Workfile"
author: "Group"
output: pdf_document
---
```{r}
library(readr)
library(dplyr)
library(ggplot2)
library(ggpubr)
```
```{r message=FALSE, warning=FALSE}
# Load the raw table and keep only the variables used in this report.
data <- read.csv("Data/initial_table.csv")
#mean(data$AGE, na.rm = TRUE)
data.work <- dplyr::select(
  data,
  ID, AGE, SEX, IBS_POST, DLIT_AG, SIM_GIPERT, endocr_01, endocr_02, ZSN, LET_IS
)
# Complete cases only; rows coded DLIT_AG == 10 are dropped as well.
data.work <- na.omit(data.work)
data.work <- dplyr::filter(data.work, DLIT_AG != 10)
dim(data.work) # obs = 1380

# exploratory analysis
# (the trailing numbers are the values recorded by the authors)
mean(data.work$AGE) #61.397
median(data.work$AGE) # 62
table(data.work$SEX) # female(0): 502, male(1): 878
table(data.work$IBS_POST) # no CHD(0): 353, exertional angina pectoris(1):443, unstable angina pectoris(2):584
mean(data.work$DLIT_AG) # 3.34
median(data.work$DLIT_AG) # 3
table(data.work$SIM_GIPERT) # no(0): 1336, yes(1): 44
table(data.work$endocr_01) # no(0): 1193, yes(1):187
table(data.work$endocr_02) # no(0): 1348, yes(1):32
table(data.work$ZSN) # no(0): 1052, yes(1): 328

# distribution plots for single variable
# Columns are referenced by bare name inside aes(); `data.work$...` inside
# aes() bypasses the plot's data argument and is discouraged by ggplot2.
age.hist <- ggplot(data.work, aes(x = AGE)) +
  geom_histogram() +
  labs(title = "age distribution", x = "age")
age.hist

sex.plot <- ggplot(data.work, aes(x = as.factor(SEX))) +
  geom_bar() +
  labs(title = "distribution of sex", x = "sex") +
  scale_x_discrete(labels = c("female", "male"))
sex.plot

ibs.plot <- ggplot(data.work, aes(x = as.factor(IBS_POST))) +
  geom_bar() +
  labs(title = "distribution of CHD in recent weeks", x = "type of CHD") +
  scale_x_discrete(
    labels = c("no CHD", "exertional angina pectoris", "unstable angina pectoris")
  )
ibs.plot

duration.hist <- ggplot(data.work, aes(x = DLIT_AG)) +
  geom_histogram() +
  labs(title = "duration of arterial hypertension", x = "years")
duration.hist

hypertension.plot <- ggplot(data.work, aes(x = as.factor(SIM_GIPERT))) +
  geom_bar() +
  labs(title = "distribution of hypertension", x = "hypertension") +
  scale_x_discrete(labels = c("no", "yes"))
hypertension.plot

diabetes.plot <- ggplot(data.work, aes(x = as.factor(endocr_01))) +
  geom_bar() +
  labs(title = "distribution of diabetes", x = "diabetes") +
  scale_x_discrete(labels = c("no", "yes"))
diabetes.plot

obesity.plot <- ggplot(data.work, aes(x = as.factor(endocr_02))) +
  geom_bar() +
  labs(title = "distribution of obesity", x = "obesity") +
  scale_x_discrete(labels = c("no", "yes"))
obesity.plot

# ZSN is the chronic heart failure (CHF) indicator used as the outcome in the
# later sections, so the plot is labelled CHF (not CHD, which is IBS_POST).
chd.plot <- ggplot(data.work, aes(x = as.factor(ZSN))) +
  geom_bar() +
  labs(title = "distribution of CHF", x = "CHF") +
  scale_x_discrete(labels = c("no", "yes"))
chd.plot
```
```{r}
# List every column of the raw table (before the variable selection above).
colnames(data)
```
## Ariane
Exploring the relationship of sex and age with chronic heart failure (CHF)
```{r}
library(DescTools)
library(tidyverse)

# Sex vs. chronic heart failure: 2 x 2 contingency table with readable labels.
data_sex_chf <- with(data.work, table(SEX, ZSN))
dimnames(data_sex_chf) <- list(
  Sex = c("Female", "Male"),
  "Chronic Heart Failure" = c("No", "Yes")
)
data_sex_chf

# Pearson chi-squared test of independence (stored and printed).
(chi_sq_data_sex_chf <- chisq.test(data_sex_chf))

# Likelihood-ratio (G) test of independence (stored and printed).
(LR_data_sex_chf <- GTest(data_sex_chf))
```
With a p-value < 0.01, we reject the null hypothesis of independence and conclude that there is an association between sex and chronic heart failure.
```{r}
# Age vs. chronic heart failure.
# Cross-tabulation of each distinct age against CHF status; it is built and
# labelled here but not printed.
data_age_chf <- table(data.work$AGE, data.work$ZSN)
dimnames(data_age_chf) <- list(
  Age = rownames(data_age_chf),
  "Chronic Heart Failure" = c("No", "Yes")
)
#data_age_chf

# Horizontal boxplots of age, one per CHF status.
boxplot_age_chf <- ggplot(data.work) +
  geom_boxplot(aes(x = AGE, y = as.factor(ZSN), group = as.factor(ZSN))) +
  ylab("Chronic Heart Failure") +
  scale_y_discrete(labels = c("No", "Yes"))
boxplot_age_chf

# Age summary for patients without CHF (ZSN == 0).
data.work %>%
  filter(ZSN == 0) %>%
  select(AGE) %>%
  summary()

# Age summary for patients with CHF (ZSN == 1).
data.work %>%
  filter(ZSN == 1) %>%
  select(AGE) %>%
  summary()

# Wilcoxon rank-sum test comparing age between the two CHF groups.
wilcox.test(
  data.work$AGE[which(data.work$ZSN == 0)],
  data.work$AGE[which(data.work$ZSN == 1)]
)
```
The Wilcoxon rank-sum test rejects the null hypothesis (p-value < 0.01), so we conclude that there is a difference in age between the two outcome groups.
```{r}
# Treat age categorically: round each age down to its decade (e.g. 63 -> 60).
age_decade <- data.work %>%
  mutate(decade = 10 * floor(AGE / 10)) %>%
  select(decade)

# Decade x CHF contingency table; row labels read "40s", "50s", ...
data_age_decade_chf <- table(age_decade$decade, data.work$ZSN)
dimnames(data_age_decade_chf) <- list(
  Age = paste0(rownames(data_age_decade_chf), "s"),
  "Chronic Heart Failure" = c("No", "Yes")
)
data_age_decade_chf

# Pearson chi-squared test of independence (stored and printed).
(chi_sq_data_age_decade_chf <- chisq.test(data_age_decade_chf))

# Likelihood-ratio (G) test of independence (stored and printed).
(LR_data_age_decade_chf <- GTest(data_age_decade_chf))
```
Using age grouped by decade, we obtain a p-value < 0.01; like the Wilcoxon test, this rejects the null hypothesis and suggests an association between age and chronic heart failure.
## Alona
Exploring the relationship between CHF and Duration of arterial hypertension.
```{r}
library(knitr)
library(tidyverse)
library(vcdExtra, quietly = TRUE)
library(DescTools)
library(ResourceSelection)

# Duration of arterial hypertension (DLIT_AG): Ordinal
# Count and relative frequency of each DLIT_AG code.
freq.dlitag <- data.work %>%
  group_by(DLIT_AG) %>%
  dplyr::summarise(n = n()) %>%
  mutate(freq = prop.table(n))
freq.dlitag

# Count plot: point size shows how many patients share each
# (CHF status, DLIT_AG code) combination.
ggplot(data.work) +
  geom_count(aes(x = as.factor(ZSN), y = DLIT_AG))
```
The two classes of CHF have similar distributions of proportions across the levels of duration of arterial hypertension. We will further test the hypothesis that there is an association between the two variables.
```{r}
# data.work.2: original DLIT_AG codes, with code 10 removed (likely a mistake).
data.work.2 <- data.work %>%
  filter(DLIT_AG != 10)

# data.work.3: adds DLIT_AG_N, an alternative scoring in which code 6 is
# scored 8 and code 7 is scored 10; codes 0-5 keep their own value.
data.work.3 <- data.work %>%
  mutate(
    DLIT_AG_N = case_when(
      DLIT_AG == 6 ~ 8,
      DLIT_AG == 7 ~ 10,
      DLIT_AG %in% 0:5 ~ as.numeric(DLIT_AG)
    )
  )

mean(data.work.2$DLIT_AG) # 3.36
median(data.work.2$DLIT_AG) #3

# Duration x CHF table with descriptive labels for each duration code.
tab <- with(data.work.2, table(DLIT_AG, ZSN))
dimnames(tab) <- list(
  "Duration of AH" = c(
    "None", "1-year", "2-years", "3-years", "4-years",
    "5-years", "6-10 years", ">=10 years"
  ),
  "Chronic Heart Failure" = c("No", "Yes")
)

# Same cross-tabulation using the re-scored duration variable.
tab2 <- table(data.work.3$DLIT_AG_N, data.work.3$ZSN)

# contingency table
dlitag <- as.table(tab)
kable(
  dlitag,
  caption = "Duration of Arterial Hypertension by Chronic Heart Failure"
)
```
Duration of arterial hypertension is an ordinal variable; we therefore use ordinal trend tests.
```{r}
# Ordinal trend tests on the duration-by-CHF table `dlitag`.

# Goodman-Kruskal gamma. The two-sided p-value is computed from |gamma / sigma|;
# without abs() a negative gamma would give a "p-value" larger than 1.
gamma.test <- GKgamma(dlitag)
pvalg <- 2 * pnorm(
  abs(gamma.test$gamma / gamma.test$sigma),
  lower.tail = FALSE
)
pvalg

# Cochran Armitage Test for Ix2 tables - section 5.3.5 in the book
coarm <- CochranArmitageTest(dlitag)
coarm

# chisq test can be used but is less powerful than the two above.
# The test is run once and both the statistic and the standardized residuals
# are taken from the same result.
chisq.res <- chisq.test(dlitag)
chisq <- round(chisq.res$statistic, 3)
#pval <- round(chisq.res$p.value,3)
#lrt <- GTest(dlitag)
std.res <- chisq.res$stdres

# all p-values from all test are confirming the finding that there is no relationship between
# duration of arterial hypertension and chronic heart failure

# residual analysis
# this is just a cool plot - unfortunately nothing is significant so there is no color.
mosaicplot(
  dlitag,
  main = "",
  xlab = "Duration of Arterial Hypertension",
  ylab = "Chronic Heart Failure",
  las = 1,
  border = "chocolate",
  shade = TRUE
)
```
All tests have non-significant p-values (> 0.2), so we do not reject the null hypothesis of no association.
```{r}
# Logistic regression models for Chronic heart failure - ZSN as a function of DLIT_AG
# The same linear predictor is fitted under three link functions.

# canonical link
fit.dlit.l <- glm(ZSN ~ DLIT_AG, data = data.work.2, family = binomial)
summary(fit.dlit.l)
# fit.dlitn.l <- glm(ZSN ~ DLIT_AG_N, data=data.work.3, family=binomial)
# summary(fit.dlitn.l)

# cloglog link
fit.dlit.cll <- glm(ZSN ~ DLIT_AG, data = data.work.2,
                    family = binomial(link = "cloglog"))
summary(fit.dlit.cll)

# identity link
fit.dlit.i <- glm(ZSN ~ DLIT_AG, data = data.work.2,
                  family = binomial(link = "identity"))
summary(fit.dlit.i)

#goodness of fit
# Residual deviance of the logit model referred to a chi-squared distribution
# on the residual degrees of freedom. The upper tail is requested directly
# instead of 1 - pchisq() to avoid cancellation for small p-values.
# NOTE(review): ZSN looks like an ungrouped 0/1 response here; the residual
# deviance is generally not chi-squared distributed in that setting - confirm
# whether a grouped-data or Hosmer-Lemeshow test is more appropriate.
G.sq <- deviance(fit.dlit.l)
df.fit <- fit.dlit.l$df.residual
p.val <- pchisq(G.sq, df.fit, lower.tail = FALSE)
p.val

# Predicted CHF probability over the observed range of DLIT_AG (23 grid
# points), drawn on top of the observed 0/1 outcomes.
newdata <- data.frame(
  DLIT_AG = seq(min(data.work.2$DLIT_AG), max(data.work.2$DLIT_AG),
                length.out = 23)
)
newdata$ZSN <- predict(fit.dlit.l, newdata = newdata, type = "response")
plot(ZSN ~ DLIT_AG, data = data.work.2, col = "black",
     main = "Plot A",
     ylab = "Predicted probability of CHF",
     xlab = "Duration of arterial hypertension")
lines(ZSN ~ DLIT_AG, data = newdata, col = "Blue", lwd = 2)
```
The logistic regression model for CHF as explained by duration of arterial hypertension is not predictive. The predicted probabilities are effectively constant, and the goodness-of-fit p-value is
`r I(p.val)`, suggesting that we reject the null hypothesis that the model fits the data.
```{r include=FALSE}
# consider other representations of the predictor variable.
# dichotomizing at the median - to assess the U shaped distribution factor of DLIT_AG
# DLIT_AG_BIN : 1 when DLIT_AG >= 3, else 0
# DLIT_AG_BIN2: 1 when DLIT_AG >= 1 (any arterial hypertension), else 0
data.work.2$DLIT_AG_BIN <- as.numeric(data.work.2$DLIT_AG >= 3)
data.work.2$DLIT_AG_BIN2 <- as.numeric(data.work.2$DLIT_AG >= 1)

# 2 x 2 tables of each binary cut against CHF.
tab2 <- with(data.work.2, table(DLIT_AG_BIN, ZSN))
dimnames(tab2) <- list(
  "Duration of AH" = c("<3", ">=3"),
  "Chronic Heart Failure" = c("No", "Yes")
)
tab2a <- with(data.work.2, table(DLIT_AG_BIN2, ZSN))
dimnames(tab2a) <- list(
  "Duration of AH" = c("0", ">0"),
  "Chronic Heart Failure" = c("No", "Yes")
)

dlitag2 <- as.table(tab2)
kable(dlitag2)
dlitag2a <- as.table(tab2a)
kable(dlitag2a)

# contingency tables
# Median cut: chi-squared statistic, p-value, G test, standardized residuals.
chisq.res2 <- chisq.test(dlitag2)
chisq2 <- round(chisq.res2$statistic, 3)
pval2 <- round(chisq.res2$p.value, 3)
pval2
lrt2 <- GTest(dlitag2)
std.res2 <- chisq.res2$stdres

# None-vs-any cut: same four quantities.
chisq.res2a <- chisq.test(dlitag2a)
chisq2a <- round(chisq.res2a$statistic, 3)
pval2a <- round(chisq.res2a$p.value, 3)
pval2a
lrt2a <- GTest(dlitag2a)
std.res2a <- chisq.res2a$stdres
```
Considering the U-shaped distribution of the variable, we also conducted the analysis for the variable dichotomized at the median. The results were no different from those for the original form.
We also evaluated whether a binary cut of the duration of arterial hypertension, no arterial hypertension (category 0) vs. duration of arterial hypertension > 0, has a more meaningful association with CHF; here too, the results were not different.
```{r include=FALSE}
# we consider using values that are strictly greater than 0 since 0 has no duration value.
# analysis excluding no arterial hypertension
data.sub <- data.work.3 %>%
  filter(DLIT_AG_N != 0)

# Re-scored duration x CHF table for patients with some arterial hypertension.
tab3 <- table(data.sub$DLIT_AG_N, data.sub$ZSN)
dimnames(tab3) <- list(
  "Duration of AH" = c(
    "1 year", "2 years", "3 years", "4 years",
    "5 years", "6-10 years", ">=10 years"
  ),
  "Chronic Heart Failure" = c("No", "Yes")
)
dlitag3 <- as.table(tab3)

#Ordinal trend test
# Two-sided p-value from |gamma / sigma|; without abs() a negative gamma would
# give a "p-value" larger than 1.
gamma.test.2 <- GKgamma(dlitag3)
pvalg2 <- 2 * pnorm(
  abs(gamma.test.2$gamma / gamma.test.2$sigma),
  lower.tail = FALSE
)
pvalg2

# Cochran Armitage Test for Ix2 tables - section 5.3.5 in the book
coarm2 <- CochranArmitageTest(dlitag3)
coarm2
```
In conclusion, the duration of arterial hypertension by itself is not associated with the outcome of chronic heart failure. This ordinal variable was tested in its original form (with equally spaced categories) and was also evaluated with adjusted scores for the last two categories, whose codes do not map one-to-one to a number of years.
## Minsu
Build a multivariable logistic regression model, identifying the best model, and calculating predictive power of the model.
```{r}
# Add the re-scored duration variable: code 6 is scored 8, code 7 is scored 10,
# and codes 0-5 keep their own value.
data.work <- data.work %>%
  mutate(
    DLIT_AG_N = case_when(
      DLIT_AG == 6 ~ 8,
      DLIT_AG == 7 ~ 10,
      DLIT_AG %in% 0:5 ~ as.numeric(DLIT_AG)
    )
  )

#fit a model with all 7 predictors
# The three binary comorbidity indicators are converted to labelled factors.
data.work$SIM.f <- factor(data.work$SIM_GIPERT, levels = c(0, 1),
                          labels = c("no", "yes"))
data.work$endocr_01.f <- factor(data.work$endocr_01, levels = c(0, 1),
                                labels = c("no", "yes"))
data.work$endocr_02.f <- factor(data.work$endocr_02, levels = c(0, 1),
                                labels = c("no", "yes"))

# select() is namespace-qualified (as in the data-loading chunk) so that a
# same-named function from another attached package cannot be picked up.
chf.dat <- dplyr::select(
  data.work,
  AGE, SEX, IBS_POST, DLIT_AG, SIM.f, endocr_01.f, endocr_02.f, ZSN
)
fit <- glm(ZSN ~ ., data = chf.dat, family = binomial)
summary(fit)

#overall test for model with 7 predictors
# Likelihood-ratio test of the full model against the intercept-only model.
fit.0 <- glm(ZSN ~ 1, data = chf.dat, family = binomial)
summary(fit.0)
lr <- deviance(fit.0) - deviance(fit)
df <- df.residual(fit.0) - df.residual(fit)
# Upper tail requested directly: 1 - pchisq() rounds to exactly 0 when the
# test is highly significant.
p.val <- pchisq(lr, df = df, lower.tail = FALSE)
p.val
```
There is strong evidence that at least one predictor has an effect. Although the overall test is highly significant, the summary(fit) results show that only AGE and endocr_01 seem significant in the Wald tests.
```{r}
# Extend the DLIT_AG-only logistic model (subtopic 2) with AGE and endocr_01.
fit.ini <- glm(ZSN ~ DLIT_AG, data = chf.dat, family = binomial)
fit.add <- glm(ZSN ~ DLIT_AG + AGE + endocr_01.f, data = chf.dat,
               family = binomial)

# Goodness of fit of the additive model: residual deviance vs. chi-squared.
G.sq <- deviance(fit.add)
df.fit <- df.residual(fit.add)
p.val <- 1 - pchisq(G.sq, df.fit)

# Compare the additive model with the initial DLIT_AG-only model.
anova(fit.ini, fit.add)

# Likelihood-ratio test computed by hand: deviance drop on the df difference.
lr <- deviance(fit.ini) - deviance(fit.add)
df <- anova(fit.ini, fit.add, test = "LRT")$Df[2]
p.val <- 1 - pchisq(lr, df = df)
p.val
```
The model with AGE and endocr_01 in addition to DLIT_AG improves the goodness-of-fit. Next, we perform stepwise model selection using forward selection and backward elimination to see whether there is an interaction effect between predictors.
```{r}
#Backward selection
# Start from the model with all interactions among DLIT_AG, AGE and endocr_01.f.
fit.3 <- glm(ZSN ~ DLIT_AG * AGE * endocr_01.f, data = chf.dat,
             family = binomial)
mod.back <- step(
  fit.3,
  scope = list(lower = ~ 1, upper = formula(fit.3)),
  scale = 1, trace = TRUE, direction = "backward"
)
res.back <- mod.back$anova
res.back

#Forward selection
# Start from the intercept-only model with the same upper bound.
fit.0 <- glm(ZSN ~ 1, data = chf.dat, family = binomial)
mod.for <- step(
  fit.0,
  scope = list(lower = ~ 1, upper = formula(fit.3)),
  scale = 1, trace = TRUE, direction = "forward"
)
res.for <- mod.for$anova
res.for

#fit the best model
fit.best <- glm(ZSN ~ AGE + DLIT_AG * endocr_01.f, data = chf.dat,
                family = binomial)
summary(fit.best)

#goodness of fit
G.sq <- deviance(fit.best)
df.fit.best <- fit.best$df.residual
p.val <- pchisq(G.sq, df.fit.best, lower.tail = FALSE)

#compare this best model with the additive model
# Both the deviance difference and its degrees of freedom come from the same
# pair of nested models (fit.add vs. fit.best).
lr <- fit.add$deviance - fit.best$deviance
df <- anova(fit.add, fit.best, test = "LRT")$Df[2]
p.val <- pchisq(lr, df = df, lower.tail = FALSE)
p.val

#predictive power using ROC curve
library(ROCR)
pred1 <- prediction(fitted(fit.add), chf.dat$ZSN)
val1 <- performance(pred1, "tpr", "fpr")
pred2 <- prediction(fitted(fit.best), chf.dat$ZSN)
val2 <- performance(pred2, "tpr", "fpr")
lab1 <- expression('AGE'+'DLIT_AG'+'endocr_01')
lab2 <- expression('AGE'+'DLIT_AG x endocr_01')
# ROC of the additive model (red, solid) with the selected model overlaid
# (blue, dashed); the axis labels are read from the performance object.
plot(val1@x.values[[1]], val1@y.values[[1]], type = "s",
     ylab = val1@y.name, xlab = val1@x.name, col = "red", lwd = 2)
lines(val2@x.values[[1]], val2@y.values[[1]], type = "s", col = "blue", lty = 2)
abline(0, 1, col = "gray")
legend("bottomright", c(lab1, lab2), col = c("red", "blue"),
       lwd = c(2, 1), lty = 1:2, cex = .9, bty = "n")
```
The model with AGE plus the interaction between DLIT_AG and endocr_01.f doesn't improve the goodness-of-fit over the additive model.
As seen in the ROC figure, the two ROC curves are very close, so we expect their predictive performance to be almost the same.
## Jadey
Build a multivariable logistic regression model to predict death in the cohort and check the model's prediction accuracy.
```{r}
data.work2 <- data.work
# death = 0 when LET_IS == 0, and 1 for every other LET_IS code.
data.work2$death <- ifelse(data.work2$LET_IS == 0, 0, 1)
# NOTE(review): the recorded counts below sum to 1403, not the 1380 rows
# reported for data.work - re-check against the current data.
table(data.work2$death) # survive: 1212, dead: 191

# Plot from data.work2 itself (the data frame that holds `death`).
ggplot(data.work2, aes(x = as.factor(death))) +
  geom_bar() +
  labs(title = "distribution of death", x = "death") +
  scale_x_discrete(labels = c("no", "yes"))

# use stepwise selection to select variable
death.fit0 <- glm(death ~ 1, data = data.work2, family = binomial)
# NOTE(review): the main effects use as.factor(), while the AGE * x terms use
# the raw numeric columns, so several variables enter the formula twice under
# different codings - confirm this is intended.
death.fit1 <- glm(
  death ~ AGE + as.factor(SEX) + as.factor(IBS_POST) + DLIT_AG +
    as.factor(SIM_GIPERT) + as.factor(endocr_01) + as.factor(endocr_02) +
    AGE * IBS_POST + AGE * DLIT_AG + AGE * SIM_GIPERT + AGE * endocr_01 +
    AGE * endocr_02,
  data = data.work2, family = binomial
)
# `scope` must be a formula or a list with `lower`/`upper` formulas; the null
# and full model formulas are passed explicitly as the search bounds.
step(
  death.fit1,
  scope = list(lower = formula(death.fit0), upper = formula(death.fit1)),
  direction = "both"
) # selected variable: AGE, IBS_POST, SIM_GIPERT, endocr_01, endocr_02

# fit the best model
death.fit.logit <- glm(
  death ~ AGE + as.factor(IBS_POST) + as.factor(SIM_GIPERT) +
    as.factor(endocr_01) + as.factor(endocr_02),
  data = data.work2, family = binomial
)
deviance(death.fit.logit) # 1018.906

# Hosmer-Lemeshow test to check goodness of fit
library(ResourceSelection)
death.pred <- predict(death.fit.logit, data.work2, type = "response")
hoslem.test(data.work2$death, death.pred, g = 20) # p = 0.4291, fail to reject H0

## Get indices of vector fit, from smallest to greatest
fit <- death.fit.logit$fitted.values
index <- sort.list(fit)
## check 10 smallest indices
index[1:10]
## create a matrix of death and fit, using this index
## (column 1 = observed death, column 2 = fitted probability)
hosmer <- matrix(
  c(data.work2$death[index], fit[index]),
  byrow = FALSE, nrow = nrow(data.work2)
)
head(hosmer)

## group into 20 equal-sized groups of consecutive rows (69 observations per
## group when there are 1380 rows); rows beyond 20 * group.size are not used
n.groups <- 20
group.size <- nrow(hosmer) %/% n.groups
group.id <- rep(seq_len(n.groups), each = group.size)
grouped.rows <- seq_len(n.groups * group.size)

## observed death rate in each group
observed <- as.vector(tapply(hosmer[grouped.rows, 1], group.id, mean))
observed
# repeat the previous step for the predicted probability
predicted <- as.vector(tapply(hosmer[grouped.rows, 2], group.id, mean))
predicted

# plot observed versus predicted
# The red reference line is y = x; geom_abline() takes `intercept` and `slope`.
ggplot(data.frame(predicted, observed), aes(x = predicted, y = observed)) +
  geom_point() +
  geom_line() +
  geom_abline(intercept = 0, slope = 1, color = "red")

# model summary
summary(death.fit.logit)

# calculate classification accuracy at a 0.5 cut-off
# The predictions are compared with the modelled outcome `death`.
glm.predict <- ifelse(death.pred > 0.5, 1, 0)
mean(glm.predict == data.work2$death)
```
The final fitted model: $\log \frac{\pi_i}{1 - \pi_i} = -6.018 + 0.058 \times age + 0.073 \times I(IBS = 1) + 0.696 \times I(IBS = 2) + 0.726 \times I(SIM = 1) + 0.476 \times I(endocr01 = 1) + 1.081 \times I(endocr02 = 1)$. The Hosmer-Lemeshow test shows adequate goodness of fit (p = 0.4291).
Fit logistic regression with multinomial response
```{r}
library(nnet)

# Keep only the patients with a non-zero LET_IS code (the deaths).
data.work3 <- dplyr::filter(data.work2, LET_IS != 0)
dim(data.work3) # n = 191
table(data.work3$LET_IS)

# Bar chart of the causes of death. The column is referenced by bare name
# inside aes(), and the labels are staggered over two rows to avoid overlap.
ggplot(data.work3, aes(x = as.factor(LET_IS))) +
  geom_bar() +
  labs(title = "causes of death") +
  xlab("") +
  scale_x_discrete(
    labels = c(
      "cardiogenic shock", "pulmonary edema", "myocardial rupture",
      "progress of congestive heart failure", "thromboembolism", "asystole",
      "ventricular fibrillation"
    ),
    guide = guide_axis(n.dodge = 2)
  )

# Multinomial logit for cause of death, using the predictors of the binary
# death model.
multinom(
  LET_IS ~ AGE + as.factor(IBS_POST) + as.factor(SIM_GIPERT) +
    as.factor(endocr_01) + as.factor(endocr_02),
  data = data.work3
)
```