Skip to content

Commit a09ee12

Browse files
author
Rahul raoniar
committed
Update logistic regression
New update
1 parent d0f0745 commit a09ee12

File tree

2 files changed

+330
-0
lines changed

2 files changed

+330
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,330 @@
1+
# Article 1
2+
# Shrikant I. Bangdiwala (2018): Regression: binary logistic, International Journal
3+
# of Injury Control and Safety Promotion, DOI: 10.1080/17457300.2018.1486503
4+
5+
# Article 2
6+
# Leeper, T.J., 2017. Interpreting regression results using average marginal
7+
# effects with R's margins. Tech. rep. URL
8+
# https://cran.r-project.org/web/packages/margins/index.html
9+
10+
11+
12+
#############################
13+
# Part 1: load Libraries
14+
#############################
15+
library(mlbench) # for PimaIndiansDiabetes2 dataset
16+
library(tidyverse) # or specifically you can use ggplot2 library
17+
# for plotting
18+
library(broom) # Make model summary tidy
19+
library(caret) # use to compute confusion matrix
20+
library(visreg) # For plotting log odds and probabilities
21+
library(margins) # Use to calculate Average Marginal Effects
22+
library(rcompanion) # Use to calculate pseudo R2
23+
library(ROCR) # Use to calculate Receiver Operating Characteristic curve
24+
25+
26+
27+
#############################
28+
# Part 2: Gather and clean data
29+
#############################
30+
31+
data(package = "mlbench")
32+
33+
34+
35+
# load the diabetes dataset
36+
37+
data(PimaIndiansDiabetes2)
38+
39+
40+
##########################
41+
# Variables of the dataset
42+
##########################
43+
44+
# I1: pregnant: Number of times pregnant
45+
# I2: glucose: Plasma glucose concentration (glucose tolerance test)
46+
# I3: pressure: Diastolic blood pressure (mm Hg)
47+
# I4: triceps: Triceps skin fold thickness (mm)
48+
# I5: insulin: 2-Hour serum insulin (mu U/ml)
49+
# I6: mass: Body mass index (weight in kg/(height in m)^2)
50+
# I7: pedigree: Diabetes pedigree function
51+
# I8: age: Age (years)
52+
53+
# D1: diabetes: Class variable (test for diabetes)
54+
55+
56+
# Peek at the raw data
head(PimaIndiansDiabetes2)

# Save data to Diabetes: rows containing missing values are dropped, and the
# result is the dataset used for modeling
Diabetes <- na.omit(PimaIndiansDiabetes2)
head(Diabetes)
str(Diabetes)

# Changing levels neg = 0 and pos = 1
head(Diabetes) # before recoding
levels(Diabetes[["diabetes"]]) <- c("0", "1")
head(Diabetes) # after recoding
77+
78+
79+
###############################################
80+
# Part 3: Randomly dividing data samples into train and test datasets
81+
###############################################
82+
83+
84+
# Total number of rows in the Diabetes data frame
n <- nrow(Diabetes)

# Number of rows for the training set (80% of the dataset)
n_train <- round(0.80 * n)

# Create a vector of indices which is a reproducible 80% random sample.
# seq_len(n) is used instead of 1:n so that an empty data frame gives an
# empty index set rather than c(1, 0).
set.seed(123)
train_indices <- sample(seq_len(n), n_train)

# Subset the Diabetes data frame to training indices only
train <- Diabetes[train_indices, ]

# Every row that is not a training row goes to the test set. setdiff() is
# used instead of negative indexing because Diabetes[-integer(0), ] would
# select no rows at all if train_indices were empty.
test <- Diabetes[setdiff(seq_len(n), train_indices), ]
99+
100+
####################################
101+
# Part 4: Fitting a logistic regression model
102+
####################################
103+
104+
105+
# Fit a binomial GLM of diabetes on all remaining columns of the training set
model_logi <- glm(
  formula = diabetes ~ .,
  family = "binomial",
  data = train
)

# See summary statistics
summary(model_logi)

# Make data tidy using broom package
tidy(model_logi)
glance(model_logi)  # Check model fitting
augment(model_logi) # obtain fitted values
116+
117+
118+
119+
#####################################
120+
# Part 5: Calculating important statistics
121+
#####################################
122+
123+
# Part 5 (A): Calculating the odds ratios

# Odds ratios are the exponentiated model coefficients
exp(coef(model_logi))

# The same odds ratios in tidy form, with 95% confidence intervals.
# conf.int = TRUE is needed for the interval columns to be produced;
# conf.level on its own has no effect.
tidy(model_logi, exponentiate = TRUE, conf.int = TRUE, conf.level = 0.95)
128+
129+
130+
# Part 5 (B): Logodds and probability plots
131+
132+
133+
# Import and use visreg library
134+
135+
# Log odds of diabetes with respect to glucose level
visreg(
  fit = model_logi, xvar = "glucose",
  xlab = "Glucose level", ylab = "Log odds (diabetes)"
)

# Log odds of diabetes with respect to pedigree level
visreg(
  fit = model_logi, xvar = "pedigree",
  xlab = "pedigree level", ylab = "Log odds (diabetes)"
)

# Probabilities of diabetes with respect to glucose (response scale)
visreg(
  fit = model_logi, xvar = "glucose", scale = "response", rug = 2,
  xlab = "Glucose level", ylab = "P(diabetes)"
)

# Probabilities of diabetes with respect to pedigree (response scale)
visreg(
  fit = model_logi, xvar = "pedigree", scale = "response", rug = 2,
  xlab = "pedigree level", ylab = "P(diabetes)"
)
153+
154+
155+
156+
157+
# Part 5 (C): Calculate marginal effect
158+
159+
160+
######################################################
161+
162+
# The estimated coefficients from logistic regression
163+
# are not easily interpretable on their own; common summaries are:
164+
165+
# 1. Log odds: represents the change in the log of odds of
166+
# outcome for a given change in a predictor
167+
168+
# 2. odds ratios: might provide a better summary of the effects of
169+
# predictor on outcome variable (odds ratios are derived from
170+
# exponentiation of the estimated coefficients from logistic
171+
# regression). The Calculation and Interpretation of
172+
# Odds Ratios may be somewhat more meaningful.
173+
174+
# Marginal effects: Marginal effects are an alternative metric
175+
# that can be used to describe the impact of a predictor on
176+
# outcome variable. Marginal effects can be
177+
# described as the change in outcome as a
178+
# function of the change in the treatment
179+
# (or independent variable of interest) holding all other
180+
# variables in the model constant. In linear regression,
181+
# the estimated regression coefficients are marginal effects
182+
# and are more easily interpreted (more on this later).
183+
184+
185+
# There are two ways of computing Marginal Effects
186+
187+
# a) Marginal Effect at Mean
188+
# b) Average Marginal Effect
189+
190+
# The magnitude of the marginal effect depends on the
191+
# values of the other variables and their coefficients.
192+
193+
# The Marginal Effect at the Mean (MEM) is popular (i.e. compute the marginal
194+
# effects when all x's are at their mean) but many think that
195+
# Average Marginal Effects (AMEs) are superior
196+
197+
198+
# Use "margins" library for Average Marginal Effect computation
199+
200+
201+
# Calculate average marginal effect
# (assigned with `<-` for consistency with the rest of the script)
effects_logit_dia <- margins(model_logi)

print(effects_logit_dia)

# Summary of marginal effect
summary(effects_logit_dia)

# Plot marginal effect
plot(effects_logit_dia)

# Plot marginal effect using ggplot2 library.
# The columns factor, AME, lower and upper of the summary object are mapped
# to the point and error-bar layers below.
effects_logit_diab <- summary(effects_logit_dia)

ggplot(data = effects_logit_diab) +
  geom_point(mapping = aes(x = factor, y = AME)) +
  geom_errorbar(mapping = aes(x = factor, ymin = lower, ymax = upper)) +
  geom_hline(yintercept = 0) + # reference line at zero effect
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45))
230+
231+
232+
#######################
233+
# Part 6: Model Evaluation
234+
#######################
235+
236+
# Part 6 (A): Misclassification identification
# using confusion matrix

# Predicted probabilities for the test data
pred <- predict(model_logi, newdata = test, type = "response")
head(pred)

# Round the probabilities: > 0.5 converts to 1, else 0.
# Stored as a factor with both levels declared so that the contingency table
# below is always 2 x 2, even if the model predicts only one class.
predicted <- factor(round(pred), levels = 0:1)
head(predicted)

# Side by side comparison
head(data.frame(observed = test$diabetes, predicted = predicted))

# Let's create a contingency table
tab <- table(Predicted = predicted, Reference = test$diabetes)
tab

# Overall accuracy in percent: correctly classified cases on the diagonal
sum(diag(tab)) / sum(tab) * 100

# Confusion matrix using caret package.
# positive = "1" makes sensitivity, specificity etc. refer to the diabetic
# class; without it caret treats the first level ("0") as the positive class.
confusionMatrix(tab, positive = "1")
265+
266+
267+
268+
269+
270+
# Part 6 (B):
271+
# Pseudo R2 and log-likelihood ratio test
272+
273+
# Import and use rcompanion library
274+
275+
276+
nagelkerke(model_logi)
277+
278+
279+
# Part 6 (C)
280+
# Compute the cutoff values
281+
# Use ROCR Package
282+
283+
# Use the prediction function to generate a prediction result:
pred.rocr <- prediction(pred, test$diabetes)

# Accuracy of the classifier as a function of the cutoff
acc_perf <- performance(pred.rocr, "acc")
plot(acc_perf)

# Identify best value (Cutoff vs Accuracy).
# The variable names avoid masking base::eval(), base::max() and base::cut().
best_idx <- which.max(slot(acc_perf, "y.values")[[1]])
acc <- slot(acc_perf, "y.values")[[1]][best_idx]    # y.values are accuracy measures
cutoff <- slot(acc_perf, "x.values")[[1]][best_idx] # x.values are cutoff measures

print(c(Accuracy = acc, Cutoff = cutoff))
302+
303+
304+
# Part 6 (D)
305+
# Receiver Operating Characteristic Curve computation
306+
# Import ROCR library
307+
308+
# ROC (Receiver Operating Characteristic) Curve tells us about how good
309+
# the model can distinguish between two things
310+
311+
# Use the performance function to obtain the performance measurement:

# Area under the ROC curve, rounded to 4 digits for display in the plot title
perf.rocr <- performance(pred.rocr, measure = "auc",
                         x.measure = "cutoff")
perf.rocr@y.values[[1]] <- round(perf.rocr@y.values[[1]], digits = 4)

# True positive rate against false positive rate
perf.tpr.fpr.rocr <- performance(pred.rocr, "tpr", "fpr")

# pos (actual) --and-- predicted (pos)     -> correctly identified   -> True pos
# neg (actual) --and-- predicted (pos)     -> incorrectly identified -> False pos
# pos (actual) --and-- predicted (not pos) -> incorrectly rejected   -> False neg
# neg (actual) --and-- predicted (not pos) -> correctly rejected     -> True neg

# Visualize ROC curve using plot function.
# TRUE is spelled out (T can be reassigned), and the AUC is extracted with
# [[1]] so that paste() receives a number rather than a list.
plot(perf.tpr.fpr.rocr, colorize = TRUE,
     main = paste("AUC:", perf.rocr@y.values[[1]]))
abline(a = 0, b = 1) # diagonal reference line

5.2 data.table/.Rhistory

Whitespace-only changes.

0 commit comments

Comments
 (0)