From e3a646f2f8a47db94c44462a48e6d9bbbf094642 Mon Sep 17 00:00:00 2001 From: marjom02 Date: Mon, 27 May 2024 11:45:06 +0100 Subject: [PATCH] changes to activity after death flag --- R/add_activity_after_death_flag.R | 53 +++++++++++++++++-------------- R/get_slf_lookup_paths.R | 13 ++++---- 2 files changed, 37 insertions(+), 29 deletions(-) diff --git a/R/add_activity_after_death_flag.R b/R/add_activity_after_death_flag.R index 55777859d..6be5c5f73 100644 --- a/R/add_activity_after_death_flag.R +++ b/R/add_activity_after_death_flag.R @@ -11,19 +11,22 @@ add_activity_after_death_flag <- function( data, year, - deaths_data = read_file(all_slf_deaths_lookup_path())) { - # Match on BOXI NRS deaths lookup for records without chi - data <- data %>% + deaths_data = read_file(get_all_slf_deaths_lookup_path())) { + death_joined <- data %>% + dplyr::select(year, chi, record_keydate1, record_keydate2, death_date, deceased) %>% dplyr::filter(!is.na(chi) | chi != "") %>% dplyr::left_join( - deaths_data, + deaths_data %>% + slfhelper::get_chi(), by = "chi", suffix = c("", "_boxi") - ) + ) %>% + dplyr::filter(deceased == TRUE | deceased_boxi == TRUE) %>% + dplyr::distinct() # Check and print error message for records which already have a death_date in the episode file, but this doesn't match the BOXI death date - check_death_date_match <- data %>% + check_death_date_match <- death_joined %>% dplyr::filter(death_date != death_date_boxi) if (nrow(check_death_date_match) != 0) { @@ -32,7 +35,7 @@ add_activity_after_death_flag <- function( # Check and print error message for records which have a record_keydate1 after their BOXI death date - check_keydate1_death_date <- data %>% + check_keydate1_death_date <- death_joined %>% dplyr::filter(record_keydate1 > death_date_boxi) if (nrow(check_death_date_match) != 0) { @@ -40,24 +43,24 @@ add_activity_after_death_flag <- function( } - flag_data <- data %>% + flag_data <- death_joined %>% dplyr::mutate( - flag_keydate1 = if_else(record_keydate1 > death_date_boxi, 1, 0), - flag_keydate2 = if_else(record_keydate2 > death_date_boxi, 1, 0), + flag_keydate1 = dplyr::if_else(record_keydate1 > death_date_boxi, 1, 0), + flag_keydate2 = dplyr::if_else(record_keydate2 > death_date_boxi, 1, 0), # Next flag records with 'ongoing' activity after date of death (available from BOXI) if keydate2 is missing and the death date occurs in # in the current or a previous financial year. - flag_keydate2_missing = if_else(((is.na(record_keydate2) | record_keydate2 == "") & (death_date_boxi <= paste0("20", substr(year, 3, 4), "-03-31"))), 1, 0), + flag_keydate2_missing = dplyr::if_else(((is.na(record_keydate2) | record_keydate2 == "") & (death_date_boxi <= paste0("20", substr(year, 3, 4), "-03-31"))), 1, 0), # Also flag records without a death_date in the episode file, but the BOXI death date occurs in the current or a previous financial year. - flag_deathdate_missing = if_else(((is.na(death_date) | death_date == "") & (death_date_boxi <= paste0("20", substr(year, 3, 4), "-03-31"))), 1, 0) + flag_deathdate_missing = dplyr::if_else(((is.na(death_date) | death_date == "") & (death_date_boxi <= paste0("20", substr(year, 3, 4), "-03-31"))), 1, 0) ) %>% # These should be flagged by one of the two lines of code above, but in these cases, we will also fill in the blank death date if appropriate # Search all variables beginning with "flag_" for value "1" and create new variable to flag cases where 1 is present # Multiplying by 1 changes flag from true/false to 1/0 dplyr::mutate(activity_after_death = purrr::pmap_dbl( - select(., contains("flag_")), + dplyr::select(., contains("flag_")), ~ any(grepl("^1$", c(...)), na.rm = TRUE ) * 1 @@ -77,18 +80,17 @@ add_activity_after_death_flag <- function( # Fill in date of death if missing in the episode file but available in BOXI lookup, due to historic dates of death not being carried # over from previous financial years flag_data <- flag_data %>% - dplyr::mutate(death_date = if_else(((is.na(death_date) | death_date == "") & (death_date_boxi <= paste0("20", substr(year, 1, 2), "-03-31"))), death_date_boxi, death_date)) %>% - dplyr::mutate(deceased = if_else(((is.na(deceased) | deceased == "") & (deceased_boxi == TRUE)), deceased_boxi, deceased)) %>% + dplyr::filter(activity_after_death == 1) %>% # Remove temporary flag variables used to create activity after death flag and fill in missing death_date - dplyr::select(-c(death_date_boxi, deceased_boxi, flag_keydate1, flag_keydate2, flag_keydate2_missing, flag_deathdate_missing)) + dplyr::select(year, chi, record_keydate1, record_keydate2, activity_after_death) %>% + dplyr::distinct() # Match activity after death flag back to episode file final_data <- data %>% dplyr::left_join( flag_data, - by = "chi", - na_matches = "never", - relationship = "many-to-one" + by = c("year", "chi", "record_keydate1", "record_keydate2"), + na_matches = "never" ) @@ -113,7 +115,8 @@ add_activity_after_death_flag <- function( #' #' # Read data------------------------------------------------ -process_deaths_lookup <- function(update = latest_update(), ...) { +process_deaths_lookup <- function(update = latest_update(), + write_to_disk = TRUE, ...) { all_boxi_deaths <- read_file(get_slf_deaths_lookup_path("1415")) %>% rbind(read_file(get_slf_deaths_lookup_path("1516"))) %>% rbind(read_file(get_slf_deaths_lookup_path("1617"))) %>% @@ -125,7 +128,7 @@ process_deaths_lookup <- function(update = latest_update(), ...) { rbind(read_file(get_slf_deaths_lookup_path("2223"))) %>% rbind(read_file(get_slf_deaths_lookup_path("2324"))) %>% # Can this be automated to pick up files starting with name "get_slf_deaths_lookup_path"? - + slfhelper::get_chi() %>% # Remove rows with missing or blank CHI number - could also use na.omit? # na.omit(all_boxi_deaths) dplyr::filter(!is.na(chi) | chi != "") @@ -169,8 +172,12 @@ process_deaths_lookup <- function(update = latest_update(), ...) { # Maybe save as its own function # Write the all BOXI NRS deaths lookup file to disk, so this can be used to populate activity after death flag in each episode file if (write_to_disk) { - all_boxi_deaths %>% - write_file(get_all_slf_deaths_lookup_path()) + write_file( + all_boxi_deaths, + fs::path(get_slf_dir(), "Deaths", + file_name = stringr::str_glue("anon-all_slf_deaths_lookup_{update}.parquet") + ) + ) } return(all_boxi_deaths) diff --git a/R/get_slf_lookup_paths.R b/R/get_slf_lookup_paths.R index d7e68c494..3a1b932be 100644 --- a/R/get_slf_lookup_paths.R +++ b/R/get_slf_lookup_paths.R @@ -63,7 +63,7 @@ get_slf_deaths_lookup_path <- function(year, ...) { # Review the naming convention of this path and file slf_deaths_lookup_path <- get_file_path( directory = fs::path(get_slf_dir(), "Deaths"), - file_name = stringr::str_glue("slf_deaths_lookup_{year}.parquet"), + file_name = stringr::str_glue("anon-slf_deaths_lookup_{year}.parquet"), ... ) @@ -82,19 +82,20 @@ get_slf_deaths_lookup_path <- function(year, ...) { #' @family slf lookup file path #' @seealso [get_file_path()] for the generic function. -get_all_slf_deaths_lookup_path <- function(update = latest_update()) { +get_all_slf_deaths_lookup_path <- function(update = latest_update(), ...) { # Note this name is very similar to the existing slf_deaths_lookup_path which returnsthe path for # the processed BOXI extract for each financial year. This function will return the combined financial # years lookup i.e. all years put together. all_slf_deaths_lookup_path <- get_file_path( - directory = fs::path(get_slf_dir(), "Deaths", - file_name = stringr::str_glue("all_slf_deaths_lookup_{update}.parquet") - ) + directory = fs::path(get_slf_dir(), "Deaths"), + file_name = stringr::str_glue("anon-all_slf_deaths_lookup_{update}.parquet"), + ... ) - return(all_slf_deaths_lookup_path) } + + #' SLF CHI Deaths File Path #' #' @description Get the full path to the CHI deaths file