-
Notifications
You must be signed in to change notification settings - Fork 0
/
1a_rawdata_deidentify_text.R
151 lines (129 loc) · 6.94 KB
/
1a_rawdata_deidentify_text.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
### Using collaborative open science tools to improve engagement with the
# ecology of the Guana River Estuary
# Geraldine Klarenberg, PhD
# 19 June 2023
#### Survey data extraction ####
# THIS SCRIPT ONLY WORKS ON THE RAW **NON DE-IDENTIFIED DATA**
# You will need to run the script that sets your Qualtrics key FIRST, before running
# this script. This script will only work if you have admin access to the project's
# surveys (see below).
# If you have questions about this script or need to access the raw data, please
# contact the author of this script (see above).
#### NOTE for project collaborators: when running this script, make sure the raw
# data is saved to your hard disk. Data that has NOT been de-identified is
# confidential and is not allowed to be stored in the cloud, on shared drives,
# etc., as per IRB regulations.
# This script / project is set up with renv, meaning it restores the versions of
# the packages that were last used (when everything worked, we assume)
# See https://rstudio.github.io/renv/articles/renv.html
# Start all runs of this script with:
renv::restore()
# This ensures it uses the packages last used when everything worked okay. This
# also ensures these packages are installed if you don't have them
# Load packages
library("qualtRics")
library("tidyverse")
# Documentation for qualtRics: https://docs.ropensci.org/qualtRics/
#### API credentials were set up in file 0a ####
# This script will only work if these credentials are set!!!
# You can only use the API to download Qualtrics results for surveys that you are
# admin on. To check which surveys you can access, you can uncomment and run
# the next line:
# all_surveys()
# Our survey IDs, named by survey source (the names are used further down to
# fill the `source` column)
all_survey_ids <- c(
  visitor     = "SV_agTpds5m6MDrqAe", # visitor's center
  kiosk       = "SV_bqMgSIcEsmEy91Q", # kiosk
  socialmedia = "SV_ezASGpdQPyOr8Xk", # social media
  email       = "SV_9RjNbEveqMo0MLk"  # email
)
#### Get surveys ####
# Collect all our surveys (this can take a while to run).
# Remove any survey_data left over from an earlier run in this session, so the
# loop below builds it from scratch instead of appending to stale data.
if (exists("survey_data")) {
  rm(survey_data)
}
for (survey in all_survey_ids) {
  #### If you want to download from Qualtrics, uncomment and run the next line
  # survey_ind <- fetch_survey(surveyID = survey, # read survey data
  #                            # FYI without changing convert I could not get the
  #                            # full_join to work properly. Some problem with the
  #                            # factor levels
  #                            label = FALSE, # recoded values instead of text
  #                            convert = FALSE, # no conversion to proper data type
  #                            force_request = TRUE, # this forces a download (instead
  #                            # of loading existing temporary data downloaded earlier)
  #                            save_dir = "1_survey_downloads_confidential") # surveys will be saved
  #                            # as RDS files in this directory
  #### If you want to load existing downloads, run the next line (comment it
  #### out when downloading from Qualtrics instead)
  survey_ind <- readRDS(file = paste0("1_survey_downloads_confidential/",
                                      survey, ".Rds"))
  # Only attach surveys that have results. The row count is compared to zero
  # here; the earlier form `nrow(survey_ind != 0)` compared every cell of the
  # data frame to 0 and only worked by accident.
  if (nrow(survey_ind) != 0) {
    # Label each response with the name of the survey it came from
    survey_ind$source <- names(all_survey_ids)[all_survey_ids == survey]
    if (!exists("survey_data")) { # first survey with results: start the table
      survey_data <- survey_ind
    } else { # later surveys: add on (joins on all shared columns)
      survey_data <- full_join(survey_data, survey_ind)
    }
    # TODO: change something here about how data is read in! QD-2 has 7 levels
    # in one survey and 8 in the other!
  }
}
# Responses from people who gave consent but did not finalize the survey
survey_data_unfinished <- filter(
  survey_data,
  `Informed Consent` == "I agree" & Finished == FALSE
)
# Show how many unfinished surveys there are
nrow(survey_data_unfinished)
# These probably hold no email addresses, but to be on the safe side they are
# written to the confidential folder (not tracked by git)
write_csv(
  survey_data_unfinished,
  "1_survey_downloads_confidential/survey_data_unfinished_text_raw.csv"
)
# Keep everyone who gave consent. Since 15 August 2023 the `Finished` condition
# is switched off, so unfinished surveys are kept as well.
survey_data <- survey_data %>%
  # Fill missing emails first: `NA != "..."` evaluates to NA and filter() would
  # drop those rows
  mutate(`F-4` = replace_na(`F-4`, "no email")) %>%
  filter(
    `Informed Consent` == "I agree" &
      # Finished == TRUE & # 15 August 2023: use all surveys for report
      # NOTE(review): the literal below looks like a redacted placeholder rather
      # than a real address - confirm it matches the test response's F-4 value
      `F-4` != "[email protected]" # Remove test survey Shirley did
  )
# Save email addresses of people that want to stay involved in a separate file
# These questions are F-1 through F-4
survey_data_contacts <- survey_data %>%
  select(starts_with("F-"))
# Add proper questions as headers instead of codes
questions_detail <- read_csv("1_metadata/mc_questions_options.csv")
contact_headers <- questions_detail %>%
  filter(str_detect(qname, "F-")) %>%
  filter(!duplicated(qname)) %>% # keep the first row for each question
  pull(main) # pull instead of select gives a vector, which is what we need
             # for renaming the headers
# The headers are assigned by position, so stop if the metadata does not yield
# exactly one header per contact column (or if columns 2-4 do not exist).
# NOTE(review): this assumes the metadata rows are in the same order as the
# F- columns - confirm.
stopifnot(
  length(contact_headers) == ncol(survey_data_contacts),
  ncol(survey_data_contacts) >= 4
)
names(survey_data_contacts) <- contact_headers
# Suffix the headers of columns 2-4 with the type of input they refer to
names(survey_data_contacts)[2:4] <- paste0(
  names(survey_data_contacts)[2:4],
  c(" - meetings", " - surveys", " - testing")
)
# Reshape to long format: columns 2-4 are stacked into `question` (the header)
# and `type_input` (the answer)
survey_data_contacts <- survey_data_contacts %>%
  pivot_longer(cols = 2:4, names_to = "question", values_to = "type_input") %>%
  # select(!question) %>% # superfluous column now
  mutate(
    # People who answered "Yes" to the updates-only question get that label;
    # for "No" or a missing answer the existing type_input is kept
    type_input = if_else(
      `Although you do not want to give further input on the project, would you like to receive email updates?` == "Yes",
      "Email updates only",
      type_input,
      type_input
    )
  ) %>%
  distinct() # Remove duplicates
# The NAs in type_input are deliberately kept: this way people who said no to
# giving further input and also no to receiving email updates stay in the file.
# Maybe/probably still good info to have?
# NOTE(review): this file holds the answers to the contact questions (which
# include email addresses) but is written to 2_data_deidentified/ - confirm
# that this folder is not shared or tracked.
write_csv(survey_data_contacts, "2_data_deidentified/survey_data_contacts.csv")
# De-identify data
# Remove email addresses, IPAddress, latitude and longitude
# Assign random unique IDs
set.seed(7) # Fixed seed, so every run produces the same randomized numbers
# A random ordering of the numbers 1 to (number of responses), one per row
random_IDs <- sample.int(nrow(survey_data))
survey_data_safe <- select(
  survey_data,
  !c(starts_with("F-"), "IPAddress", "LocationLatitude", "LocationLongitude")
)
survey_data_safe <- mutate(survey_data_safe, ID = random_IDs)
# Tidy up: drop the columns that are not needed, then save the result
survey_data_safe <- select(
  survey_data_safe,
  !c(
    "Status", "Progress", "Finished", "ResponseId",
    "RecipientLastName", "RecipientFirstName", "RecipientEmail",
    "ExternalReference", "DistributionChannel", "UserLanguage",
    "Informed Consent"
  )
)
write_csv(
  survey_data_safe,
  "2_data_deidentified/survey_data_safe_text_raw.csv"
)