# Article 1
# Bangdiwala, S.I. (2018). Regression: binary logistic. International Journal
# of Injury Control and Safety Promotion. DOI: 10.1080/17457300.2018.1486503

# Article 2
# Leeper, T.J. (2017). Interpreting regression results using average marginal
# effects with R's margins. Technical report.
# URL: https://cran.r-project.org/web/packages/margins/index.html


#############################
# Part 1: Load libraries
#############################
library(mlbench)    # for the PimaIndiansDiabetes2 dataset
library(tidyverse)  # includes ggplot2, which is used for plotting
library(broom)      # to tidy model summaries
library(caret)      # to compute the confusion matrix
library(visreg)     # to plot log odds and probabilities
library(margins)    # to calculate average marginal effects
library(rcompanion) # to calculate pseudo R-squared
library(ROCR)       # to compute the Receiver Operating Characteristic curve


#############################
# Part 2: Gather and clean data
#############################

data(package = "mlbench") # list the datasets available in mlbench


# Load the diabetes dataset

data(PimaIndiansDiabetes2)


##########################
# Variables of the dataset
##########################

# I1: pregnant: Number of times pregnant
# I2: glucose:  Plasma glucose concentration (glucose tolerance test)
# I3: pressure: Diastolic blood pressure (mm Hg)
# I4: triceps:  Triceps skin fold thickness (mm)
# I5: insulin:  2-hour serum insulin (mu U/ml)
# I6: mass:     Body mass index (weight in kg / (height in m)^2)
# I7: pedigree: Diabetes pedigree function
# I8: age:      Age (years)

# D1: diabetes: Class variable (test for diabetes)


head(PimaIndiansDiabetes2)

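# A quick check (my addition, not part of the original script): this version
# of the dataset codes biologically impossible zero values as NA, so it is
# worth seeing how much data na.omit() below will discard.
colSums(is.na(PimaIndiansDiabetes2)) # missing values per variable
nrow(PimaIndiansDiabetes2)           # rows before dropping NAs
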
# Drop rows with missing values and save the result to Diabetes

Diabetes <- na.omit(PimaIndiansDiabetes2) # dataset for modeling

head(Diabetes)

str(Diabetes)


# Recode the factor levels: neg = 0 and pos = 1

head(Diabetes)

levels(Diabetes$diabetes) <- 0:1

head(Diabetes)


###############################################
# Part 3: Randomly divide the data into train and test sets
###############################################

# Total number of rows in the Diabetes data frame
n <- nrow(Diabetes)

# Number of rows for the training set (80% of the dataset)
n_train <- round(0.80 * n)

# Create a vector of indices which is an 80% random sample
set.seed(123)
train_indices <- sample(1:n, n_train)

# Subset the Diabetes data frame to training indices only
train <- Diabetes[train_indices, ]

# Exclude the training indices to create the test set
test <- Diabetes[-train_indices, ]
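
# A quick sanity check (my addition): the split is random, so the outcome
# balance should be similar in the train and test sets.
prop.table(table(train$diabetes))
prop.table(table(test$diabetes))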

####################################
# Part 4: Fitting a logistic regression model
####################################

model_logi <- glm(diabetes ~ ., data = train, family = "binomial")

summary(model_logi) # see the summary statistics


# Tidy the model output using the broom package
tidy(model_logi)    # coefficient table

glance(model_logi)  # model fit statistics

augment(model_logi) # fitted values and diagnostics

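# Illustrative sketch (my addition): each fitted probability is the inverse
# logit of the linear predictor, which ties the log-odds scale to the
# probability scale used later.
eta <- predict(model_logi, type = "link")[1] # linear predictor (log odds)
plogis(eta)                                  # inverse logit -> probability
predict(model_logi, type = "response")[1]    # same value from glm directly
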

#####################################
# Part 5: Calculating important statistics
#####################################

# Part 5 (A): Calculating the odds ratios

exp(coef(model_logi))

tidy(model_logi, exponentiate = TRUE, conf.int = TRUE,
     conf.level = 0.95) # odds ratios with 95% confidence intervals
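
# Interpretation sketch (my addition): the exponentiated glucose coefficient
# is the multiplicative change in the odds of diabetes per one-unit increase
# in plasma glucose, holding the other predictors fixed.
or_glucose <- exp(coef(model_logi)["glucose"])
(or_glucose - 1) * 100    # percent change in odds per unit of glucose
(or_glucose^10 - 1) * 100 # percent change for a 10-unit increase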


# Part 5 (B): Log odds and probability plots


# Uses the visreg library loaded in Part 1

# Log odds of diabetes with respect to glucose level
visreg(model_logi, "glucose", xlab = "Glucose level",
       ylab = "Log odds (diabetes)")

# Log odds of diabetes with respect to pedigree level
visreg(model_logi, "pedigree", xlab = "Pedigree level",
       ylab = "Log odds (diabetes)")


# Probability of diabetes with respect to glucose level
visreg(model_logi, "glucose", scale = "response", rug = 2,
       xlab = "Glucose level", ylab = "P(diabetes)")

# Probability of diabetes with respect to pedigree level
visreg(model_logi, "pedigree", scale = "response", rug = 2,
       xlab = "Pedigree level", ylab = "P(diabetes)")


# Part 5 (C): Calculate marginal effects

######################################################

# The estimated coefficients from a logistic regression are not directly
# interpretable on the probability scale, so several summaries are used:

# 1. Log odds: a coefficient represents the change in the log odds of the
#    outcome for a one-unit change in a predictor.

# 2. Odds ratios: obtained by exponentiating the estimated coefficients,
#    they summarize the multiplicative effect of a predictor on the odds
#    of the outcome, which is often somewhat more meaningful.

# 3. Marginal effects: an alternative metric that describes the impact of
#    a predictor on the outcome variable. A marginal effect is the change
#    in the outcome for a change in the treatment (or independent variable
#    of interest), holding all other variables in the model constant. In
#    linear regression the estimated coefficients are themselves marginal
#    effects, which is why they are more easily interpreted.


# There are two ways of computing marginal effects:

# a) Marginal Effect at the Mean (MEM)
# b) Average Marginal Effect (AME)

# The magnitude of a marginal effect depends on the values of the other
# variables and their coefficients.

# The MEM (the marginal effect computed with all x's held at their means)
# is popular, but many consider AMEs superior; a hand-computed MEM sketch
# follows the AME summary below.


# Use the "margins" library for the Average Marginal Effect computation

# Calculate the average marginal effects
effects_logit_dia <- margins(model_logi)

print(effects_logit_dia)

# Summary of the marginal effects
summary(effects_logit_dia)
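
# A hand-computed MEM sketch (my addition, for comparison with the AMEs
# above): for a logit model the marginal effect of a continuous predictor
# x_j is beta_j * p * (1 - p); here it is evaluated with every predictor
# held at its sample mean.
x_bar <- c(1, colMeans(train[, -9]))                # intercept + predictor means
p_bar <- plogis(sum(coef(model_logi) * x_bar))      # P(diabetes) at the means
mem   <- coef(model_logi)[-1] * p_bar * (1 - p_bar) # MEM for each predictor
round(mem, 4)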


# Plot the marginal effects
plot(effects_logit_dia)


# Plot the marginal effects using ggplot2

effects_logit_diab <- summary(effects_logit_dia)

ggplot(data = effects_logit_diab) +
  geom_point(mapping = aes(x = factor, y = AME)) +
  geom_errorbar(mapping = aes(x = factor, ymin = lower, ymax = upper)) +
  geom_hline(yintercept = 0) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45))

#######################
# Part 6: Model evaluation
#######################

# Part 6 (A): Misclassification identification
# using the confusion matrix

pred <- predict(model_logi, test, type = "response") # predict on the test data

head(pred)

predicted <- round(pred) # round the probabilities: > 0.5 becomes 1,
                         # otherwise 0
head(predicted)

# Side-by-side comparison

head(data.frame(observed = test$diabetes, predicted = predicted))


# Create a contingency table

tab <- table(Predicted = predicted, Reference = test$diabetes)

tab

# Overall accuracy (%)
sum(diag(tab)) / sum(tab) * 100


# Confusion matrix using the caret package

confusionMatrix(tab)


# Part 6 (B):
# Pseudo R-squared and log-likelihood ratio test

# Uses the rcompanion library loaded in Part 1

nagelkerke(model_logi)
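
# A quick cross-check (my addition): McFadden's pseudo R-squared computed by
# hand from the fitted and null deviances; it should match the McFadden
# value reported by nagelkerke().
1 - model_logi$deviance / model_logi$null.deviance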


# Part 6 (C)
# Compute the cutoff values
# using the ROCR package

# Use the prediction function to generate a prediction object:

pred.rocr <- prediction(pred, test$diabetes)

# Accuracy across all possible cutoffs
eval <- performance(pred.rocr, "acc")

plot(eval)


# Identify the best value (cutoff vs accuracy); the variables are named
# max_idx and cutoff to avoid masking base R's max() and cut()

max_idx <- which.max(slot(eval, "y.values")[[1]])
acc <- slot(eval, "y.values")[[1]][max_idx]    # y.values are accuracy measures
cutoff <- slot(eval, "x.values")[[1]][max_idx] # x.values are cutoff values

print(c(Accuracy = acc, Cutoff = cutoff))
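
# Optional follow-up (my addition): re-classify the test set with the
# accuracy-maximizing cutoff found above instead of the default 0.5.
predicted_opt <- ifelse(pred > cutoff, 1, 0)
table(Predicted = predicted_opt, Reference = test$diabetes)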


# Part 6 (D)
# Receiver Operating Characteristic (ROC) curve computation
# using the ROCR library

# The ROC curve tells us how well the model can distinguish
# between the two classes across all possible cutoffs

# Use the performance function to obtain the performance measurement:

perf.rocr <- performance(pred.rocr, measure = "auc",
                         x.measure = "cutoff")

perf.rocr@y.values[[1]] <- round(perf.rocr@y.values[[1]], digits = 4)

perf.tpr.fpr.rocr <- performance(pred.rocr, "tpr", "fpr")

# pos (actual) and predicted pos     -> correctly identified   -> true positive
# neg (actual) and predicted pos     -> incorrectly identified -> false positive
# pos (actual) and predicted not pos -> incorrectly rejected   -> false negative
# neg (actual) and predicted not pos -> correctly rejected     -> true negative


# Visualize the ROC curve using the plot function

plot(perf.tpr.fpr.rocr, colorize = TRUE,
     main = paste("AUC:", perf.rocr@y.values[[1]]))
abline(a = 0, b = 1)