-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathNumvars_Cleaning_Script_v01.Rmd
136 lines (131 loc) · 7.92 KB
/
Numvars_Cleaning_Script_v01.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
---
title: "Numvars_Cleaning_Script_v01.2"
author: "Rita M. Ludwig, Ph.D."
date: "`r Sys.Date()`"
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
## Load in
#Packages
library(tidyverse)
library(sjlabelled)
library(stringr)
library(textclean)
#Data
load("~/P1_Baseline_Cleaned_2023.Rdata")
#Dictionary
datadict = read.csv('~/P1_baseline_datadictionary_2023.csv')
## Set path and filename to write out final dataset to
path = "~/Cleaned"
filename = "P1_Baseline_Cleaned_2023"
```
INPUT: 1) Cleaned dataset.Rdata 2) Data dictionary.csv
OUTPUT: Three new collections of variables (see below for details); cleaned dataset .csv, .dta, .Rdata files.
BEHAVIOR: Cleans variables with expected numerical values that were collected with open text entry fields. For variables that are of number entry type, passes them through a standardized set of cleaning steps to result in a numeric value. For variables that are part of entry-frequency pairs, uses a function to generate a) a monthly figure, b) an annual figure, and c) an hourly figure (when applicalble) for each variable category.
```{r variable_vectors, include=FALSE}
## Create vectors of the variables to be cleaned. These will be all variables of 'number entry' type, which include financial frequency-amount pair variables.
#Vector of all financial variables to be cleaned in this script.
finvars = data %>%
select(., contains("entry") | contains("frequency")) %>%
select(., contains("finances")| contains("jobs")) %>%
colnames(.)
#Vector of all frequency variables.
freqvars = finvars %>%
str_subset(., "frequency") %>%
str_extract(., pattern = "^([^_]*_[^_]*[^_]*)_")
#Vector of all number entry vars. Need this in order to clean the text that participants provided.
entryvars = datadict %>%
filter(type == "number entry") %>%
pull(canonical_name)
#Create another vector of the variables expecting annual figures that aren't part of freq-amt pairs, since some people provided monthly and that needs to be converted.
annualvars = entryvars %>%
str_subset(., "earn|housing")
## Store vector of value labels, because they will be destroyed in mutate calls and will need to be re-applied at the end.
varlabs = data %>% var_label(.)
#var_label(df1) <- labs
```
```{r entry_cleaning, include=FALSE}
## Clean the number entry variables. Run through a standardized list of cleaning steps, including converting all of the 'cute' ways participants enter numbers. This cleaning saves the updates in-place, meaning that no new 'cleaned' variables are created in the dataset; the original values are overwritten with the cleaned ones.
#Clean the data to make these variables numerical. Converts 'don't know' answers to NA; replace Os substituted for 0s in hundreds; replace commas written after the dollar figure to periods (e.g. "90,00 to 90.00"); replace Ks following numbers with 000s; convert numbers written in English to digits, strip decimal places from numbers.
data = data %>%
mutate(across(all_of(entryvars), ~ tolower(.))) %>%
mutate(across(all_of(entryvars), ~ replace(., str_detect(., "not sure"), NA_real_))) %>%
mutate(across(all_of(entryvars), ~ replace(., str_detect(., "unknown"), NA_real_))) %>%
mutate(across(all_of(entryvars), ~ replace(., str_detect(., "don't know"), NA_real_))) %>%
mutate(across(all_of(entryvars), ~ replace(., str_detect(., "n\\\\a"), NA_real_))) %>%
mutate(across(all_of(entryvars), ~ replace(., str_detect(., "skip"), NA_real_))) %>%
mutate(across(all_of(entryvars), ~ replace(., str_detect(., "none"), 0))) %>%
mutate(across(all_of(entryvars), ~ str_replace_all(., 'OOO', "000"))) %>%
mutate(across(all_of(entryvars), ~ str_replace_all(., '(,\\d{2})$', '.00'))) %>%
mutate(across(all_of(entryvars), ~ str_replace_all(., regex('(?<![[:alpha:]])K\\b', ignore_case = TRUE), "000"))) %>%
mutate(across(all_of(entryvars), ~ str_replace_all(., regex('thousand', ignore_case = TRUE), "000"))) %>%
mutate(across(all_of(entryvars), ~ str_remove(., pattern = "\\.{1,}[0-9]{2}"))) %>%
mutate(across(all_of(entryvars), ~ mgsub(., replace_number(seq_len(100)), seq_len(100))))
#Now, detect instances where variables expecting annual figures were provided monthly ones, and calculate their annual estimates.
monthtoannual = function(x) {
tempdata = data %>%
select(., all_of(c("id", x))) %>%
filter(., str_detect(!!sym(x), "month|mo.|mo|mth")) %>%
mutate(across(all_of(!!x), ~ str_extract(., pattern = "\\d+,?\\d+"))) %>%
mutate(across(all_of(!!x), ~ str_replace_all(., '(,\\d{2})$', ''))) %>%
mutate(across(all_of(!!x), ~ str_remove_all(., "\\D+"))) %>%
mutate(across(all_of(!!x), ~ as.numeric(.)*12)) %>%
mutate(across(all_of(!!x), ~ as.character(.)))
data <<- data %>% rows_update(., tempdata, by = "id")
}
#Run the function
map(annualvars, ~monthtoannual(.x))
#Find numbers that are provided in a range indicated with a dash between numbers and take the median
data = data %>%
mutate(across(all_of(entryvars), ~ case_when(str_detect(., "\\-") == TRUE
~ sapply(str_extract_all(., '[0-9.]+'), function(x) as.character(round(median(as.numeric(x)),0))),
TRUE ~ .)))
#Finally, clean all of the income variables so that the number is extracted.
data = data %>%
mutate(across(all_of(entryvars), ~ str_remove_all(., "\\D+"))) %>%
mutate(across(all_of(entryvars), ~ as.numeric(.)))
```
```{r frequency_cleaning, include=FALSE}
## Clean the frequency variables and convert them all into a standard metric. Here, generate two new variables each - one for monthly, and one for annual converted amounts.
#This function takes the vector of frequency variable prefixes and runs them through a mathematical conversion to generate monthly and annual figures.
freqconv = function(z) {
tempnames = data %>%
select(., "id", contains(z)) %>%
select(., "id", contains("frequency") | contains("entry"))
tempfreq = str_subset(colnames(tempnames), "frequency")
tempentr = str_subset(colnames(tempnames), "entry")
tempnames = tempnames %>%
mutate(across(all_of(tempfreq), ~ sjlabelled::as_label(., drop.NA = TRUE)))
tempnames = tempnames %>%
mutate(., "{z}annually" := case_when(!!sym(tempfreq) == "Daily" ~ !!sym(tempentr)*365,
!!sym(tempfreq) == "Weekly" ~ !!sym(tempentr)*52,
!!sym(tempfreq) == "Biweekly" ~ !!sym(tempentr)*26,
!!sym(tempfreq) == "Bimonthly" ~ !!sym(tempentr)*24,
!!sym(tempfreq) == "Monthly" ~ !!sym(tempentr)*12,
!!sym(tempfreq) == "Annually" ~ !!sym(tempentr)*1
))
tempnames = tempnames %>%
mutate(., "{z}monthly" := case_when(!!sym(tempfreq) == "Daily" ~ !!sym(tempentr)*30.437,
!!sym(tempfreq) == "Weekly" ~ !!sym(tempentr)*4.35,
!!sym(tempfreq) == "Biweekly" ~ !!sym(tempentr)*2.17,
!!sym(tempfreq) == "Bimonthly" ~ !!sym(tempentr)*2,
!!sym(tempfreq) == "Monthly" ~ !!sym(tempentr)*1,
!!sym(tempfreq) == "Annually" ~ !!sym(tempentr)/12
) %>% ceiling(.))
tempnames = tempnames %>%
mutate(., "{z}hourly" := case_when(!!sym(tempfreq) == "Hourly" ~ !!sym(tempentr)*1))
tempnames = tempnames %>% select(-c(tempfreq, tempentr))
data <<- data %>% left_join(., tempnames, by = "id")
}
#Run the function
map(freqvars, ~freqconv(.x))
## Finally, apply labels back to the data
var_label(data) = varlabs
```
Save cleaned data files.
```{r save the data, include=FALSE}
write_csv(data, file.path(path, paste0(filename,".csv")))
write_stata(data, file.path(path, paste0(filename,".dta")), drop.na = FALSE, version = 14)
save(data, file = file.path(path, paste0(filename,".Rdata")))
```