-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
Related to #862 WIP - Added function for get_all_slf_deaths_lookup_path which combines all financial years' extract. - Added function for add_activity_after_death_flag Following identification of activity recorded after death due to data quality or admin issues, an investigation into sources of death data available for SLFs was undertaken. This guided the addition of a new variable to flag records where activity occurred after the death of date. Further details about the issues and methodology can be found in the accompanying "Investigation into activity after death" report.
- Loading branch information
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,184 @@ | ||
#' Match on BOXI NRS death dates to process activity after death flag | ||
#' | ||
#' @description Match on CHI number where available in the episode file, and add date of death from the BOXI NRS lookup. | ||
#' Create new activity after death flag | ||
#' | ||
#' @param data episode files | ||
#' @param year financial year, e.g. '1920' | ||
#' @param deaths_data The death data for the year | ||
#' | ||
#' @return data flagged if activity after death | ||
add_activity_after_death_flag <- function( | ||
data, | ||
year, | ||
deaths_data = read_file(get_all_slf_deaths_lookup_path()) %>% | ||
slfhelper::get_chi()) { | ||
death_joined <- data %>% | ||
dplyr::select(.data$year, .data$chi, .data$record_keydate1, .data$record_keydate2, .data$death_date, .data$deceased) %>% | ||
dplyr::filter(!is.na(.data$chi) | .data$chi != "") %>% | ||
dplyr::left_join( | ||
deaths_data, | ||
by = "chi", | ||
suffix = c("", "_boxi") | ||
) %>% | ||
dplyr::filter(.data$deceased == TRUE | .data$deceased_boxi == TRUE) %>% | ||
dplyr::distinct() | ||
|
||
|
||
# Check and print error message for records which already have a death_date in the episode file, but this doesn't match the BOXI death date | ||
check_death_date_match <- death_joined %>% | ||
dplyr::filter(.data$death_date != .data$death_date_boxi) | ||
|
||
if (nrow(check_death_date_match) != 0) { | ||
warning("There were records in the episode file which already have a death_date, but does not match the BOXI NRS death date.") | ||
} | ||
|
||
|
||
# Check and print error message for records which have a record_keydate1 after their BOXI death date | ||
check_keydate1_death_date <- death_joined %>% | ||
dplyr::filter(.data$record_keydate1 > .data$death_date_boxi) | ||
|
||
if (nrow(check_death_date_match) != 0) { | ||
warning("There were records in the episode file which have a record_keydate1 after the BOXI NRS death date.") | ||
} | ||
|
||
|
||
flag_data <- death_joined %>% | ||
dplyr::mutate( | ||
flag_keydate1 = dplyr::if_else(.data$record_keydate1 > .data$death_date_boxi, 1, 0), | ||
flag_keydate2 = dplyr::if_else(.data$record_keydate2 > .data$death_date_boxi, 1, 0), | ||
|
||
# Next flag records with 'ongoing' activity after date of death (available from BOXI) if keydate2 is missing and the death date occurs in | ||
# in the current or a previous financial year. | ||
flag_keydate2_missing = dplyr::if_else(((is.na(.data$record_keydate2) | .data$record_keydate2 == "") & (.data$death_date_boxi <= paste0("20", substr(.data$year, 3, 4), "-03-31"))), 1, 0), | ||
|
||
# Also flag records without a death_date in the episode file, but the BOXI death date occurs in the current or a previous financial year. | ||
flag_deathdate_missing = dplyr::if_else(((is.na(.data$death_date) | .data$death_date == "") & (.data$death_date_boxi <= paste0("20", substr(.data$year, 3, 4), "-03-31"))), 1, 0) | ||
) %>% | ||
# These should be flagged by one of the two lines of code above, but in these cases, we will also fill in the blank death date if appropriate | ||
|
||
# Search all variables beginning with "flag_" for value "1" and create new variable to flag cases where 1 is present | ||
# Multiplying by 1 changes flag from true/false to 1/0 | ||
dplyr::mutate(activity_after_death = purrr::pmap_dbl( | ||
dplyr::select(., tidyselect::contains("flag_")), | ||
~ any(grepl("^1$", c(...)), | ||
na.rm = TRUE | ||
) * 1 | ||
)) | ||
|
||
|
||
# Check and print error message for records which already are TRUE for the deceased variable in the episode file, but this doesn't match the | ||
# BOXI deceased variable | ||
check_deceased_match <- flag_data %>% | ||
dplyr::filter(.data$deceased != .data$deceased_boxi) | ||
|
||
if (nrow(check_deceased_match) != 0) { | ||
warning("There were records in the episode file which have a deceased variable which does not match the BOXI NRS deceased variable") | ||
} | ||
|
||
|
||
# Fill in date of death if missing in the episode file but available in BOXI lookup, due to historic dates of death not being carried | ||
# over from previous financial years | ||
flag_data <- flag_data %>% | ||
dplyr::filter(.data$activity_after_death == 1) %>% | ||
# Remove temporary flag variables used to create activity after death flag and fill in missing death_date | ||
dplyr::select(.data$year, .data$chi, .data$record_keydate1, .data$record_keydate2, .data$activity_after_death) %>% | ||
dplyr::distinct() | ||
|
||
# Match activity after death flag back to episode file | ||
final_data <- data %>% | ||
dplyr::left_join( | ||
flag_data, | ||
by = c("year", "chi", "record_keydate1", "record_keydate2"), | ||
na_matches = "never" | ||
) | ||
|
||
|
||
return(final_data) | ||
} | ||
|
||
|
||
#' Create and read SLF Deaths lookup from processed BOXI NRS deaths extracts | ||
#' | ||
#' #' @description The BOXI NRS deaths extract lookup should be created after the extract files for all years have been processed, | ||
# but before an episode file has been produced. Therefore, all BOXI NRS years should be run before running episode files. | ||
#' | ||
#' @param file_path Path to the BOXI NRS file for each financial year - may not use this | ||
#' @param year The year to process, in FY format - may not use this | ||
#' | ||
#' @param write_to_disk (optional) Should the data be written to disk default is | ||
#' `TRUE` i.e. write the data to disk. | ||
#' | ||
#' @return the final data as a [tibble][tibble::tibble-package]. | ||
#' @export | ||
#' | ||
#' | ||
#' | ||
# Read data------------------------------------------------ | ||
process_deaths_lookup <- function(update = latest_update(), | ||
write_to_disk = TRUE, ...) { | ||
all_boxi_deaths <- read_file(get_slf_deaths_lookup_path("1415")) %>% | ||
rbind(read_file(get_slf_deaths_lookup_path("1516"))) %>% | ||
rbind(read_file(get_slf_deaths_lookup_path("1617"))) %>% | ||
rbind(read_file(get_slf_deaths_lookup_path("1718"))) %>% | ||
rbind(read_file(get_slf_deaths_lookup_path("1819"))) %>% | ||
rbind(read_file(get_slf_deaths_lookup_path("1920"))) %>% | ||
rbind(read_file(get_slf_deaths_lookup_path("2021"))) %>% | ||
rbind(read_file(get_slf_deaths_lookup_path("2122"))) %>% | ||
rbind(read_file(get_slf_deaths_lookup_path("2223"))) %>% | ||
rbind(read_file(get_slf_deaths_lookup_path("2324"))) %>% | ||
# Can this be automated to pick up files starting with name "get_slf_deaths_lookup_path"? | ||
slfhelper::get_chi() %>% | ||
# Remove rows with missing or blank CHI number - could also use na.omit? | ||
# na.omit(all_boxi_deaths) | ||
dplyr::filter(!is.na(.data$chi) | chi != "") | ||
|
||
# Check all CHI numbers are valid | ||
chi_check <- all_boxi_deaths %>% | ||
dplyr::pull(.data$chi) %>% | ||
phsmethods::chi_check() | ||
|
||
if (!all(chi_check %in% c("Valid CHI", "Missing (Blank)", "Missing (NA)"))) { | ||
# There are some Missing (NA) values in the extracts, but I have excluded them above as they cannot be matched to episode file | ||
stop("There were bad CHI numbers in the BOXI NRS file") | ||
} | ||
|
||
# Check and print error message for chi numbers with more than one death date | ||
duplicates <- all_boxi_deaths %>% | ||
janitor::get_dupes(.data$chi) | ||
|
||
if (nrow(duplicates) != 0) { | ||
# There are some Missing (NA) values in the extracts, but I have excluded them above as they cannot be matched to episode file | ||
warning("There were duplicate death dates in the BOXI NRS file.") | ||
} | ||
|
||
|
||
# We decided to include duplicates as unable to determine which is correct date (unless IT can tell us, however, they don't seem to know | ||
# the process well enough), and overall impact will be negligible | ||
# Get anon_chi and use this to match onto episode file later | ||
all_boxi_deaths <- all_boxi_deaths %>% | ||
slfhelper::get_anon_chi() | ||
|
||
# Save out duplicates for further investigation if needed (as anon_chi) | ||
if (!missing(duplicates)) { | ||
write_file( | ||
duplicates, | ||
fs::path(get_slf_dir(), "Deaths", | ||
file_name = stringr::str_glue("slf_deaths_duplicates_{update}.parquet") | ||
) | ||
) | ||
} | ||
|
||
# Maybe save as its own function | ||
# Write the all BOXI NRS deaths lookup file to disk, so this can be used to populate activity after death flag in each episode file | ||
if (write_to_disk) { | ||
write_file( | ||
all_boxi_deaths, | ||
fs::path(get_slf_dir(), "Deaths", | ||
file_name = stringr::str_glue("anon-all_slf_deaths_lookup_{update}.parquet") | ||
) | ||
) | ||
} | ||
|
||
return(all_boxi_deaths) | ||
} |
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.