-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathLab2_DescriptivesVisuals.R
336 lines (263 loc) · 13 KB
/
Lab2_DescriptivesVisuals.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
###############################################
### LAB 2: Describing and Visualizing Data ###
###############################################
#### OUTLINE:
#1. Looking at our data + practice with spotting mistakes and recoding variables
#2. Descriptive Statistics for Categorical variables (+presentation style tables)
#3. Descriptive Statistics for Quantitative variables (+presentation style tables)
#4. Introduction to ggplot2 package for creating figures
#5. Visualizing data for exploration
#### Install Missing Packages
# requireNamespace() only checks whether a package is installed; unlike
# require(), it does not attach the package as a side effect, so the
# "is it installed?" check stays separate from the loading step below.
# psych is part of the check because library(psych) below stops with an error
# when the package is missing.
# NOTE(review): rties is loaded below but is not part of this install check -
# confirm where it should be installed from before adding it to the list.
lab_packages <- c("psych", "flextable", "table1", "rstatix", "tidyverse")
is_installed <- vapply(lab_packages, requireNamespace, logical(1),
                       quietly = TRUE)
if (!all(is_installed)) {
  install.packages(lab_packages[!is_installed])
}

#### Load Packages
# library() stops with an error if a package is unavailable, so a missing
# dependency is noticed here rather than halfway through the lab.
library(psych)     # describe()
library(rties)     # histAll()
library(flextable) # flextable(), save_as_docx()
library(table1)    # table1(), t1flex()
library(tidyverse) # mutate(), case_when(), %>% and ggplot()
library(rstatix)   # get_summary_stats()
## What is the first thing you should do when you open RStudio and want to work with data? Set our working directory!
setwd("/Users/akuelz/Desktop/315/Spr24/Lab/data") # this is my path.. your path WILL BE different
#OR - 1) Session > 2)Set Working Directory > 3) Choose Directory
getwd() # prints the current working directory so you can confirm it changed
#### Load in Data
# "Garcia.csv" is looked up relative to the working directory set above.
# stringsAsFactors = TRUE asks read.csv to store text columns as factors.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
d0 <- read.csv("Garcia.csv", stringsAsFactors = TRUE)
## *** Two simple things we should do after loading in a dataset ***:
str(d0) # str stands for structure. think of it like a table of contents
# num stands for numeric (vectors are numeric when elements contain decimals) - descriptive statistics can be computed
# int stands for integer (similar to numeric but only whole #'s) - descriptive statistics can be computed but it may not make sense
# factor - this is R's version of a categorical variable
summary(d0) # summary provides common summary statistics
# summary statistics reported for continuous variables
# frequencies reported for categorical/factor variables
# another way to look at summary statistics using describe in the psych package:
# quant requests the 25th and 75th percentiles, IQR = TRUE requests the IQR.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
describe(d0, quant = c(.25, .75), IQR = TRUE)
## the min and max values for two of these variables appear to be capped at either (0,2) or (0,1)
# these may not be truly quantitative or "continu-ish"
# let's check out their distributions using histograms
histAll(d0) # what are your thoughts on 'protest'? what about anger?
# these "numbers" appear to be labels only
# let's look at the information on the dataset to see what the labels correspond to
# open description of data
## There are two variables (protest and anger) that are not truly continuous based on the way they were measured
# We have some 'cleaning up' to do since both are currently recognized as integer variables (i.e., quantitative variables)
# Although we know what these variables should be, R currently does not. Time to catch R up to speed!
########## Recode 'protest' into its appropriate form: Categorical Variable (i.e., Factor in R)
## Base R Way:
# The new column protestF holds the same information as protest, but stored
# as a factor with readable category names instead of the codes 0, 1 and 2.
d0[["protestF"]] <- factor(
  x = d0[["protest"]],
  levels = c(0, 1, 2),
  labels = c("No Protest", "Individual Protest", "Collective Protest")
)
## Anatomy of a typical R function call:
# - one or more objects that the function operates on
# - mandatory and optional arguments, separated by commas inside the parentheses
## The two extra arguments used here:
# levels: the "numbers" that act as placeholders for the groups of the
#         categorical variable
# labels: the text assigned to each "number"; labels are paired with levels
#         by position, so they must be listed in the same order as the levels
## Cross-check the new factor against the original 'protest' variable:
# remember that 0, 1, and 2 are not real numbers. these are labels
table(d0[["protest"]], useNA = "ifany")
# every code should land in exactly one labelled column of the cross-tab
table(d0[["protest"]], d0[["protestF"]], useNA = "ifany")
## Tidyverse Approach:
# The labels are spelled exactly as in the base R version ("No Protest", with a
# capital P) so that protest_f and protestF end up with identical categories.
d0 <- d0 %>%
  mutate(
    protest_f = case_when(
      protest == 0 ~ "No Protest",
      protest == 1 ~ "Individual Protest",
      protest == 2 ~ "Collective Protest"
    ),
    # case_when returned text; the levels argument fixes the category order
    protest_f = factor(
      protest_f,
      levels = c("No Protest", "Individual Protest", "Collective Protest")
    )
  )
# Let's break down this code as it includes some elements we may not be familiar with
# Starting with 'mutate' --> this is a tidyverse function for recoding variables
# protest_f --> this is the new factor variable we are creating
# case_when --> this is a tool within mutate that allows us to use if/then logic
# (e.g., if the original variable protest is equal to 0, the new factor variable protest_f will equal "No Protest")
# notice within the case_when parentheses, the tilde symbol "~" represents the logic of "then"
# also notice the comma after every line inside the case_when parentheses, excluding the final line
########## Recode 'anger' into a Categorical Variable (i.e., Factor in R)
## Let's practice doing this using the data description:
# levels lists the original codes; labels names them in the same order.
d0$angerY <- factor(d0$anger, levels = c(0, 1),
                    labels = c("No Anger", "Anger"))
# Check our coding work
# useNA = "ifany" matches the check used for 'protest': factor() turns any code
# that is not listed in levels into NA, and without useNA those rows would
# silently drop out of the table instead of flagging a recoding mistake.
table(d0$anger, d0$angerY, useNA = "ifany")
########### Describing Data: Categorical Variables ###########
## With the categorical variables coded as factors, we can describe how often
## each category occurs.
# Question: what percentage were in the 'No Protest' group?
# counts per category (a separate NA count appears only if any are missing)
table(d0[["protestF"]], useNA = "ifany")
# the same counts expressed as proportions of the total
prop.table(table(d0[["protestF"]], useNA = "ifany"))
# proportions rescaled to percentages
100 * prop.table(table(d0[["protestF"]], useNA = "ifany"))
## Build a frequency table of our correctly identified categorical variables
## with the table1 function
# The result is saved as an object called 'freq' so that the table can be
# exported into a Word document for later use
freq <- table1(~ angerY + protestF, data = d0)
# Assigning to an object shows nothing by itself; call the object to see it
freq
## Convert the table1 output with t1flex() and write it to a Word document in
## our working directory using save_as_docx() from the flextable package
save_as_docx(t1flex(freq), path = "L2_frequencies.docx")
## Later we will open this document and manipulate it to look "better"
######### Describing Data: Continu-ish/Quantitative Variables #########
# For quick exploration, summary and describe are the fastest options
summary(d0)
describe(d0)
## For presentation purposes we switch to a function whose output works nicely
## with the flextable package: 'get_summary_stats' from the 'rstatix' package
# statistics are only calculated for variables identified as numeric/integer
get_summary_stats(d0)
# oops.. the original number-coded versions of 'protest' and 'anger' are still
# in there, so keep only the truly quantitative variables
num <- d0[, c("sexism", "liking", "respappr")] # base R way
num_tv <- d0 %>%
  dplyr::select(sexism, liking, respappr) # tidyverse version using '%>%'
## Run get_summary_stats on the new dataframe with the appropriate variables,
## then display the result with the flextable function
desc <- get_summary_stats(num)
flextable(desc)
## For reporting (and HW..hint), we want sample size, mean, sd, median, IQR, Q1, Q3, min and max
# i.e., remove 'mad', 'se', and 'ci' from the object
## We will also take this time to rearrange the columns such that statistics follow a more cohesive structure
# Columns are picked by name rather than by position, so the table keeps the
# intended statistics even if the column order of get_summary_stats() output
# changes, and the code states outright which statistics are reported.
report_columns <- c("variable", "n", "min", "max", "mean", "sd",
                    "median", "q1", "q3", "iqr")
desc1 <- desc[, report_columns]
flextable(desc1)
## We again use the flextable package to save the output displayed in Viewer as a word document in our working directory
flextable(desc1) %>%
  save_as_docx(path = "L2_descriptives.docx")
## Later we will open this document and manipulate it to look "better"
######## VISUALIZING DATA (Plotting) ########
#### Introduction to ggplot (Grammar of Graphics) #####
## ggplot is short for the "Grammar of Graphics": the idea that any visual can
# be assembled from a small set of basic parts.
## The example below builds a slightly fancy scatterplot one part at a time.
# Step 1: Specify the data
ggplot(d0)
# Step 2: Specify the aesthetic mapping (which variable goes on which axis)
ggplot(d0, aes(x = respappr, y = liking))
# Step 3: Specify the geometric object
ggplot(d0, aes(x = respappr, y = liking)) +
  geom_point(
    shape = 21,
    size = 2,
    color = "black",
    fill = "grey60",
    alpha = .40,
    position = position_jitter()
  )
# geom_point draws the observed data points
# shape = which symbol to draw
# size = how large each symbol is
# color = color of the symbol 'borders'
# fill = color of the symbol 'insides'
# alpha = opacity of the symbol (smaller values are more see-through)
# position_jitter = nudges points slightly at random, which is helpful when
#                   multiple data points lie on top of each other
# Step 4: Add a regression line
# geom_smooth with method = "lm" overlays a fitted straight line; se = TRUE
# draws the band around it, and fullrange = TRUE extends the line across the
# full width of the plot.
ggplot(d0, aes(x = respappr, y = liking)) +
  geom_point(
    shape = 21,
    size = 2,
    color = "black",
    fill = "grey60",
    alpha = .60,
    position = position_jitter()
  ) +
  geom_smooth(
    method = "lm",
    color = "black",
    fill = "grey60",
    se = TRUE,
    fullrange = TRUE
  )
# Step 5: Add in some labels and fix scaling
# scale_*_continuous sets where the axis tick marks fall (every .5 on x and
# every 1 on y, from 1 to 7); labs supplies the axis labels, title and caption.
ggplot(d0, aes(x = respappr, y = liking)) +
  geom_point(
    shape = 21,
    size = 2,
    color = "black",
    fill = "grey60",
    alpha = .60,
    position = position_jitter()
  ) +
  geom_smooth(
    method = "lm",
    color = "black",
    fill = "grey60",
    se = TRUE,
    fullrange = TRUE
  ) +
  scale_x_continuous(breaks = seq(1, 7, by = .5)) +
  scale_y_continuous(breaks = seq(1, 7, by = 1)) +
  labs(
    title = "Target Likeability by Perceived Response Appropriateness",
    x = "Perceived Appropriateness",
    y = "Target Likeability",
    caption = "Data from the sexism (protest) study of Garcia et al., 2010"
  )
# Step 6: Deal with background lines and other formatting
# apatheme bundles our display options so they can be added to any plot:
# start from theme_bw, blank out the major and minor grid lines and the panel
# border, draw the axis lines, and use a serif font for all text.
apatheme <- theme_bw() +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    axis.line = element_line(),
    text = element_text(family = "serif")
  )
# Same plot as Step 5, now finished off with the theme defined above
ggplot(d0, aes(x = respappr, y = liking)) +
  geom_point(
    shape = 21,
    size = 2,
    color = "black",
    fill = "grey60",
    alpha = .60,
    position = position_jitter()
  ) +
  geom_smooth(
    method = "lm",
    color = "black",
    fill = "grey60",
    se = TRUE,
    fullrange = TRUE
  ) +
  scale_x_continuous(breaks = seq(1, 7, by = .5)) +
  scale_y_continuous(breaks = seq(1, 7, by = 1)) +
  labs(
    title = "Target Likeability by Perceived Response Appropriateness",
    x = "Perceived Appropriateness",
    y = "Target Likeability",
    caption = "Data from the sexism (protest) study of Garcia et al., 2010"
  ) +
  apatheme
# Step 7: Store as an object so we can export from R
# This version leaves out the title and caption and keeps only the axis labels.
# The jitter is given a fixed seed: the random nudges are re-drawn every time
# the plot is rendered, so without a seed the plot shown on screen and the
# saved .png would place the points differently, as would every re-run.
scatterH1 <- ggplot(data = d0,
                    mapping = aes(x = respappr,
                                  y = liking)) +
  geom_point(colour = "black",
             fill = "grey60",
             size = 2,
             shape = 21,
             alpha = .60,
             position = position_jitter(seed = 315)) +
  geom_smooth(method = "lm",
              se = TRUE,
              fullrange = TRUE,
              colour = "black",
              fill = "grey60") +
  scale_x_continuous(breaks = seq(1, 7, by = .5)) +
  scale_y_continuous(breaks = seq(1, 7, by = 1)) +
  labs(x = "Perceived Appropriateness",
       y = "Target Likeability") +
  apatheme
## Call on the object
scatterH1
## Save the object as a .png
# The file is written to the working directory as a 6 x 4 inch image at 500 dpi.
ggsave(filename = "scatter_plot.png",
       plot = scatterH1,
       device = "png",
       width = 6,
       height = 4,
       units = "in",
       dpi = 500)
#### Visualizing (for Exploration) ####
# Quick base-R plots for getting to know the data; these are for exploring,
# not for presentation.
## Histograms are always good to look at (which we did prior)
histAll(d0)
# histogram of a single variable
hist(d0$liking)
## plot() is a workhorse function. It will return:
# boxplots, scatterplots, barcharts, mosaic plots
# Which one you get depends on the type of variable(s) that you're working with
## Barchart Example
plot(d0$protestF,
xlab = "Protest Condition",
ylab = "Frequency") # axis labels are completely optional
# a barchart is displayed because protestF is recognized as a factor
## Scatterplot Example
# formula notation reads as "y ~ x": liking on the y-axis, respappr on the x-axis
plot(liking ~ respappr, data = d0)
# a scatterplot is displayed because both variables are recognized as quantitative
## Boxplot Example
boxplot(d0$respappr) # single variable example
plot(respappr ~ protestF, data = d0)
# a boxplot is displayed because the "predictor" is recognized as a factor
## Mosaic Plot Example
plot(protestF ~ angerY, data = d0)
# a mosaic plot is displayed because both variables are recognized as factors
## Note: We can make all of these + more in ggplot
# We will see some examples next week