Numvars_Cleaning_Script_v01.Rmd

---
title: "Numvars_Cleaning_Script_v01.2"
author: "Rita M. Ludwig, Ph.D."
date: "`r Sys.Date()`"
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
## Load in
#Packages
library(tidyverse)
library(sjlabelled)
library(stringr)
library(textclean)
#Data
load("~/Cleaned/P1_Baseline_Labelled_2023.Rdata")
#Dictionary
datadict = read.csv('~/P1_baseline_datadictionary_2023.csv')

## Set path and filename to write out final dataset to
path = "~/Cleaned"
filename = "P1_Baseline_Cleaned_2023"
```
INPUT: 1) Cleaned dataset.Rdata 2) Data dictionary.csv
OUTPUT: Three new collections of variables (see below for details); cleaned dataset .csv, .dta, .Rdata files.
BEHAVIOR: Cleans variables with expected numerical values that were collected with open text entry fields. For variables that are of number entry type, passes them through a standardized set of cleaning steps to result in a numeric value. For variables that are part of entry-frequency pairs, uses a function to generate a) a monthly figure, b) an annual figure, and c) an hourly figure (when applicalble) for each variable category.
```{r variable_vectors, include=FALSE}
## Create vectors of the variables to be cleaned. These will be all variables of 'number entry' type, which include financial frequency-amount pair variables.
#Vector of all financial variables to be cleaned in this script. 
finvars = data %>%
  select(., contains("entry") | contains("frequency")) %>%
  select(., contains("finances")| contains("jobs")) %>%
  colnames(.)
#Vector of all frequency variables.
freqvars = finvars %>%
  str_subset(., "frequency") %>%
  str_extract(., pattern = "^([^_]*_[^_]*[^_]*)_")
#Vector of all number entry vars. Need this in order to clean the text that participants provided.
entryvars = datadict %>% 
  filter(type == "number entry") %>% 
  pull(canonical_name)
#Create another vector of the variables expecting annual figures that aren't part of freq-amt pairs, since some people provided monthly and that needs to be converted.
annualvars = entryvars %>%
  str_subset(., "earn|housing")

## Store vector of value labels, because they will be destroyed in mutate calls and will need to be re-applied at the end.
varlabs = data %>% var_label(.)  
#var_label(df1) <- labs
```
```{r entry_cleaning, include=FALSE}
## Clean the number entry variables. Run through a standardized list of cleaning steps, including converting all of the 'cute' ways participants enter numbers. This cleaning saves the updates in-place, meaning that no new 'cleaned' variables are created in the dataset; the original values are overwritten with the cleaned ones.
#Clean the data to make these variables numerical. Converts 'don't know' answers to NA; replace Os substituted for 0s in hundreds; replace commas written after the dollar figure to periods (e.g. "90,00 to 90.00"); replace Ks following numbers with 000s; convert numbers written in English to digits, strip decimal places from numbers.
data = data %>% 
  mutate(across(all_of(entryvars), ~ tolower(.))) %>%
  mutate(across(all_of(entryvars), ~ replace(., str_detect(., "not sure"), NA_real_))) %>%
  mutate(across(all_of(entryvars), ~ replace(., str_detect(., "unknown"), NA_real_))) %>%
  mutate(across(all_of(entryvars), ~ replace(., str_detect(., "don't know"), NA_real_))) %>%
  mutate(across(all_of(entryvars), ~ replace(., str_detect(., "n\\\\a"), NA_real_))) %>%
  mutate(across(all_of(entryvars), ~ replace(., str_detect(., "skip"), NA_real_))) %>%
  mutate(across(all_of(entryvars), ~ replace(., str_detect(., "none"), 0))) %>%
  mutate(across(all_of(entryvars), ~ str_replace_all(., 'OOO', "000"))) %>%
  mutate(across(all_of(entryvars), ~ str_replace_all(., '(,\\d{2})$', '.00'))) %>%
  mutate(across(all_of(entryvars), ~ str_replace_all(., regex('(?<![[:alpha:]])K\\b', ignore_case = TRUE), "000"))) %>%
  mutate(across(all_of(entryvars), ~ str_replace_all(., regex('thousand', ignore_case = TRUE), "000"))) %>%
  mutate(across(all_of(entryvars), ~ str_remove(., pattern = "\\.{1,}[0-9]{2}"))) %>%
  mutate(across(all_of(entryvars), ~ mgsub(., replace_number(seq_len(100)), seq_len(100))))

#Now, detect instances where variables expecting annual figures were provided monthly ones, and calculate their annual estimates.
monthtoannual = function(x) {
  tempdata = data %>% 
    select(., all_of(c("id", x))) %>%
    filter(., str_detect(!!sym(x), "month|mo.|mo|mth")) %>%
    mutate(across(all_of(!!x), ~ str_extract(., pattern = "\\d+,?\\d+"))) %>%
    mutate(across(all_of(!!x), ~ str_replace_all(., '(,\\d{2})$', ''))) %>%
    mutate(across(all_of(!!x), ~ str_remove_all(., "\\D+"))) %>%
    mutate(across(all_of(!!x), ~ as.numeric(.)*12)) %>%
    mutate(across(all_of(!!x), ~ as.character(.)))
  data <<- data %>% rows_update(., tempdata, by = "id")
}
#Run the function
map(annualvars, ~monthtoannual(.x))
#Find numbers that are provided in a range indicated with a dash between numbers and take the median
data = data %>%
  mutate(across(all_of(entryvars), ~ case_when(str_detect(., "\\-") == TRUE
                                               ~ sapply(str_extract_all(., '[0-9.]+'), function(x) as.character(round(median(as.numeric(x)),0))),
                                               TRUE ~ .)))
#Finally, clean all of the income variables so that the number is extracted.
data = data %>%
  mutate(across(all_of(entryvars), ~ str_remove_all(., "\\D+"))) %>%
  mutate(across(all_of(entryvars), ~ as.numeric(.)))
```
```{r frequency_cleaning, include=FALSE}
## Clean the frequency variables and convert them all into a standard metric. Here, generate two new variables each - one for monthly, and one for annual converted amounts.
#This function takes the vector of frequency variable prefixes and runs them through a mathematical conversion to generate monthly and annual figures.
freqconv = function(z) {
  tempnames = data %>% 
  select(., "id", contains(z)) %>%
  select(., "id", contains("frequency") | contains("entry")) 
  tempfreq = str_subset(colnames(tempnames), "frequency")
  tempentr = str_subset(colnames(tempnames), "entry")
  tempnames = tempnames %>%
    mutate(across(all_of(tempfreq), ~ sjlabelled::as_label(., drop.NA = TRUE)))
  tempnames = tempnames %>%
    mutate(., "{z}annually" := case_when(!!sym(tempfreq) == "Daily" ~ !!sym(tempentr)*365,
                                         !!sym(tempfreq) == "Weekly" ~ !!sym(tempentr)*52,
                                         !!sym(tempfreq) == "Biweekly" ~ !!sym(tempentr)*26,
                                         !!sym(tempfreq) == "Bimonthly" ~ !!sym(tempentr)*24,
                                         !!sym(tempfreq) == "Monthly" ~ !!sym(tempentr)*12,
                                         !!sym(tempfreq) == "Annually" ~ !!sym(tempentr)*1,
                                         !!sym(tempfreq) == "No Frequency" ~ !!sym(tempentr)*1
                                         ))
   tempnames = tempnames %>% 
    mutate(., "{z}monthly" := case_when(!!sym(tempfreq) == "Daily" ~ !!sym(tempentr)*30.437,
                                      !!sym(tempfreq) == "Weekly" ~ !!sym(tempentr)*4.35,
                                      !!sym(tempfreq) == "Biweekly" ~ !!sym(tempentr)*2.17,
                                      !!sym(tempfreq) == "Bimonthly" ~ !!sym(tempentr)*2,
                                      !!sym(tempfreq) == "Monthly" ~ !!sym(tempentr)*1,
                                      !!sym(tempfreq) == "Annually" ~ !!sym(tempentr)/12
                                      ) %>% ceiling(.))
   tempnames = tempnames %>%
     mutate(., "{z}hourly" := case_when(!!sym(tempfreq) == "Hourly" ~ !!sym(tempentr)*1))
   tempnames = tempnames %>% select(-c(tempfreq, tempentr)) %>%
     select_if(function(x) !all(is.na(x)))
   data <<- data %>% left_join(., tempnames, by = "id")
}
#Run the function
map(freqvars, ~freqconv(.x))

## Finally, apply labels back to the data
var_label(data) = varlabs
```
Save cleaned data files.
```{r save the data, include=FALSE}
write_csv(data, file.path(path, paste0(filename,".csv")))
write_stata(data, file.path(path, paste0(filename,".dta")), drop.na = FALSE, version = 14)
save(data, file = file.path(path, paste0(filename,".Rdata")))
```