-
Notifications
You must be signed in to change notification settings - Fork 0
/
2_rawdata_cleaning.R
248 lines (220 loc) · 14 KB
/
2_rawdata_cleaning.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
### Using collaborative open science tools to improve engagement with the
# ecology of the Guana River Estuary
# Geraldine Klarenberg, PhD
# 12 May 2023
# Survey data cleaning and organizing (working with deidentified data)
# This script / project is set up with renv, meaning it restores the versions of
# the packages that were last used (when everything worked, we assume)
# See https://rstudio.github.io/renv/articles/renv.html
# Start all runs of this script with:
renv::restore()
# This ensures it uses the packages last used when everything worked okay. This
# also ensures these packages are installed if you don't have them
library(tidyverse)
# Important notes on how the data is organized:
# 1. If there is only one choice for people to pick, the results show as different
# numbers (1, 2, 3 etc). See file "mc_questions_options.csv" in the metadata folder.
# 2. If people can choose several options, there will be a column for each option,
# with _1, _2, etc added on, and there will be a 1 in the column if they picked it.
# The document "mc_questions_options.csv" in the metadata folder shows which column
# is which question and which option.
# 3. Question IDs (QID...) in "mc_questions_options.csv" are not in necessarily order
# and neither are the multiple choice options from point 1.
# 4. For the data questions where we ask several questions about different data
# types, the pre-fix 1_, 2_, 3_, etc refers directly to the choices made in the
# question before.
# Read in de-identified dataset. I prefer to use the numeric results. The "text" results
# also have numeric results for some questions, and when making long datasets, mixing text
# and numeric data complicates things.
all_surveys <- read_csv("2_data_deidentified/survey_data_safe_numeric_raw.csv")
##### Create sub datasets ------------------------------------
# Make sure that each also has the ID, so we can match thins up if necessary
# All datasets are transformed to long version so plotting and analysis is
# easier (hopefully). In order to make them long, the numeric results need to
# be used - so text answers are saved separately.
# These datasets are all saved with the word "basic" in the filename, meaning that
# the headers of the files are codes, not the actual questions. In order to get
# the actual questions, use the document "questions_details.csv" in the
# metadata folder.
names(all_surveys)
# Load question metadata
questions_detail <- read_csv("1_metadata/mc_questions_options.csv")
# To add on the actual questions and options picked (in text)
#### Save all text answers separately ------------------------------------
# (as they prevent pivoting the data to a long format)
text_only <- all_surveys %>%
select("ID", "source", ends_with("_TEXT")) %>% # select text questions
pivot_longer(cols = c(-1, -2), names_to = "qname", values_to = "text_answer") %>% # make long
filter(!is.na(text_answer)) %>% # remove NAs
separate(qname, c("field_no", "qname_main", "q_code", "type"),
sep = "_", remove = FALSE, convert = TRUE, fill = "left") %>%
select(-type)
write_csv(text_only, "2_data_deidentified/subsets/text_results_basic.csv")
#### Make dataset with "intro" questions ------------------------------------
intro <- all_surveys %>%
select("ID", "source", starts_with("D-")) %>% # Get section D questions
select(!ends_with("_TEXT")) %>% # Take out the text answers
pivot_longer(cols = 3:ncol(.),
names_to = "qname",
values_to = "choice") %>%
# separate main questions code and numeric options
separate(qname, c("qname_main", "q_code"), sep = "_", remove = FALSE, convert = TRUE) %>%
left_join(select(questions_detail, qname_main, q_type), relationship = "many-to-many") %>% # add on question type so we can use it to add correct text
filter(!duplicated(.)) %>%
mutate(order = if_else(q_type == "RO", # For the ordering questions, save the order as a separate column
true = choice,
false = NA)) %>%
mutate(q_code = if_else(is.na(q_code), # If the response is NA in column q_code, it means it's a single answer MC question
true = choice, # Move the answer in choice to q_code
false = q_code)) %>% # Otherwise just keep q_code
filter(!is.na(choice)) %>% # Remove NA (not picked) questions/answers
select(!choice) %>% # no need for this column anymore
left_join(select(questions_detail, qname, qname_main, main, q_code, q_text)) %>%
# reorder columns...
select(ID, source, qname, qname_main, main, q_code, q_text, order)
write_csv(intro, "2_data_deidentified/subsets/intro_results_basic2.csv")
#### Make "has accessed data" dataset ------------------------------------
# Do in two parts, because the way the questions codes are created is different
# First only the question about which data
# Then the part that refers to each data type that people picked
yes_data1 <- all_surveys %>%
select("ID", "source", contains("YD-")) %>%
select(!ends_with("_TEXT")) %>% # Take out the text answers
select("ID", "source", starts_with("YD")) %>% # Filter for questions that do NOT start with a number
select(!contains("DO")) %>% # Take out the results that indicate in which order the choices were displayed
# to the survey takers (not relevant for us - question was set up to randomize the order)
pivot_longer(cols = 3:ncol(.),
names_to = "qname",
values_to = "choice") %>%
separate(qname, c("qname_main", "q_code"), sep = "_", remove = FALSE, convert = TRUE) %>%
left_join(select(questions_detail, qname_main, q_type), relationship = "many-to-many") %>% # add on question type so we can use it to add correct text
filter(!duplicated(.)) %>%
mutate(order = if_else(q_type == "RO", # For the ordering questions, save the order as a separate column
true = choice,
false = NA)) %>%
mutate(q_code = if_else(is.na(q_code), # If the response is NA in column q_code, it means it's a single answer MC question
true = choice, # Move the answer in choice to q_code
false = q_code)) %>% # Otherwise just keep q_code
filter(!is.na(choice)) %>% # Remove NA (not picked) questions/answers
select(!choice) %>% # no need for this column anymore
left_join(select(questions_detail, qname, qname_main, main, q_code, q_text)) %>%
# reorder columns...
select(ID, source, qname, qname_main, main, q_code, q_text, order)
yes_data2 <- all_surveys %>%
select("ID", "source", contains("YD-")) %>%
select(!ends_with("_TEXT")) %>% # Take out the text answers
select("ID", "source", !starts_with("YD")) %>% # Filter for questions that start with a number
pivot_longer(cols = 3:ncol(.),
names_to = "qname",
values_to = "choice") %>%
# separate main questions code and numeric options
separate(qname, c("field_no", "qname_main", "q_code"), sep = "_", remove = FALSE, convert = TRUE) %>%
left_join(select(questions_detail, qname_main, q_type), relationship = "many-to-many") %>% # add on question type so we can use it to add correct text
filter(!duplicated(.)) %>%
mutate(order = if_else(q_type == "RO", # For the ordering questions, save the order as a separate column
true = choice,
false = NA)) %>%
mutate(q_code = if_else(is.na(q_code), # If the response is NA in column q_code, it means it's a single answer MC question
true = choice, # Move the answer in choice to q_code
false = q_code)) %>% # Otherwise just keep q_code
filter(!is.na(choice)) %>% # Remove NA (not picked) questions/answers
select(!choice) %>% # no need for this column anymore
left_join(select(questions_detail, qname, qname_main, main, field_no, field_name, q_code, q_text)) %>%
# reorder columns...
select(ID, source, qname, qname_main, field_no, field_name, main, q_code, q_text, order)
# Put together
yes_data <- full_join(yes_data1, yes_data2)
write_csv(yes_data, "2_data_deidentified/subsets/data_yes_results_basic.csv")
#### Make "has NOT accessed data" dataset ------------------------------------
no_data <- all_surveys %>%
select("ID", "source", starts_with("ND-")) %>%
select(!ends_with("_TEXT")) %>% # Take out the text answers
pivot_longer(cols = 3:ncol(.),
names_to = "qname",
values_to = "choice") %>%
# separate main questions code and numeric options
separate(qname, c("qname_main", "q_code"), sep = "_", remove = FALSE, convert = TRUE) %>%
left_join(select(questions_detail, qname_main, q_type), relationship = "many-to-many") %>% # add on question type so we can use it to add correct text
filter(!duplicated(.)) %>%
mutate(order = if_else(q_type == "RO", # For the ordering questions, save the order as a separate column
true = choice,
false = NA)) %>%
mutate(q_code = if_else(is.na(q_code), # If the response is NA in column q_code, it means it's a single answer MC question
true = choice, # Move the answer in choice to q_code
false = q_code)) %>% # Otherwise just keep q_code
filter(!is.na(choice)) %>% # Remove NA (not picked) questions/answers
select(!choice) %>% # no need for this column anymore
left_join(select(questions_detail, qname, qname_main, main, q_code, q_text)) %>%
# reorder columns...
select(ID, source, qname, qname_main, main, q_code, q_text, order)
write_csv(no_data, "2_data_deidentified/subsets/data_no_results_basic.csv")
#### Make "dashboard" dataset ------------------------------------
dashboard <- all_surveys %>%
select("ID", "source", starts_with("T-")) %>%
select(!ends_with("_TEXT")) %>% # Take out the text answers
pivot_longer(cols = 3:ncol(.),
names_to = "qname",
values_to = "choice") %>%
# separate main questions code and numeric options
separate(qname, c("qname_main", "q_code"), sep = "_", remove = FALSE, convert = TRUE) %>%
left_join(select(questions_detail, qname_main, q_type), relationship = "many-to-many") %>% # add on question type so we can use it to add correct text
filter(!duplicated(.)) %>%
mutate(order = if_else(q_type == "RO", # For the ordering questions, save the order as a separate column
true = choice,
false = NA)) %>%
mutate(q_code = if_else(is.na(q_code), # If the response is NA in column q_code, it means it's a single answer MC question
true = choice, # Move the answer in choice to q_code
false = q_code)) %>% # Otherwise just keep q_code
filter(!is.na(choice)) %>% # Remove NA (not picked) questions/answers
select(!choice) %>% # no need for this column anymore
left_join(select(questions_detail, qname, qname_main, main, q_code, q_text)) %>%
# reorder columns...
select(ID, source, qname, qname_main, main, q_code, q_text, order)
write_csv(dashboard, "2_data_deidentified/subsets/dashboard_results_basic.csv")
#### Make "trust" dataset ------------------------------------
trust <- all_surveys %>%
select("ID", "source", starts_with("TR-")) %>%
select(!ends_with("_TEXT")) %>% # Take out the text answers
pivot_longer(cols = 3:ncol(.),
names_to = "qname",
values_to = "choice") %>%
# separate main questions code and numeric options
separate(qname, c("qname_main", "q_code"), sep = "_", remove = FALSE, convert = TRUE) %>%
left_join(select(questions_detail, qname_main, q_type), relationship = "many-to-many") %>% # add on question type so we can use it to add correct text
filter(!duplicated(.)) %>%
mutate(order = if_else(q_type == "RO", # For the ordering questions, save the order as a separate column
true = choice,
false = NA)) %>%
mutate(q_code = if_else(is.na(q_code), # If the response is NA in column q_code, it means it's a single answer MC question
true = choice, # Move the answer in choice to q_code
false = q_code)) %>% # Otherwise just keep q_code
filter(!is.na(choice)) %>% # Remove NA (not picked) questions/answers
select(!choice) %>% # no need for this column anymore
left_join(select(questions_detail, qname, qname_main, main, q_code, q_text)) %>%
# reorder columns...
select(ID, source, qname, qname_main, main, q_code, q_text, order)
write_csv(trust, "2_data_deidentified/subsets/trust_results_basic.csv")
#### Make "demographics" dataset ------------------------------------
demographics <- all_surveys %>%
select("ID", "source", starts_with("DE-")) %>%
select(!ends_with("_TEXT")) %>% # Take out the text answers
pivot_longer(cols = 4:ncol(.),
names_to = "qname",
values_to = "choice") %>%
# separate main questions code and numeric options
separate(qname, c("qname_main", "q_code"), sep = "_", remove = FALSE, convert = TRUE) %>%
left_join(select(questions_detail, qname_main, q_type), relationship = "many-to-many") %>% # add on question type so we can use it to add correct text
filter(!duplicated(.)) %>%
mutate(order = if_else(q_type == "RO", # For the ordering questions, save the order as a separate column
true = choice,
false = NA)) %>%
mutate(q_code = if_else(is.na(q_code), # If the response is NA in column q_code, it means it's a single answer MC question
true = choice, # Move the answer in choice to q_code
false = q_code)) %>% # Otherwise just keep q_code
filter(!is.na(choice)) %>% # Remove NA (not picked) questions/answers
select(!choice) %>% # no need for this column anymore
left_join(select(questions_detail, qname, qname_main, main, q_code, q_text)) %>%
# reorder columns...
select(ID, source, `DE-1`, qname, qname_main, main, q_code, q_text, order)
write_csv(demographics, "2_data_deidentified/subsets/demographics_results_basic.csv")