Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

death join and distinct refined death #1015

Merged
merged 10 commits into from
Oct 22, 2024
1 change: 0 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,6 @@ export(midpoint_fy)
export(next_fy)
export(phs_db_connection)
export(previous_update)
export(process_combined_deaths_lookup)
export(process_costs_ch_rmd)
export(process_costs_dn_rmd)
export(process_costs_gp_ooh_rmd)
Expand Down
182 changes: 57 additions & 125 deletions R/add_activity_after_death_flag.R
Original file line number Diff line number Diff line change
Expand Up @@ -16,47 +16,45 @@ add_activity_after_death_flag <- function(
# to skip warnings no visible binding for global variable ‘.’
. <- NULL

data <- data %>%
dplyr::mutate(ep_row_id_death = dplyr::row_number())

death_joined <- data %>%
dplyr::select(.data$year, .data$chi, .data$record_keydate1, .data$record_keydate2, .data$death_date, .data$deceased) %>%
dplyr::filter(!is.na(.data$chi) | .data$chi != "") %>%
dplyr::left_join(
deaths_data,
dplyr::select(
"year",
"chi",
"recid",
"record_keydate1",
"record_keydate2",
"death_date",
"deceased",
"ep_row_id_death"
) %>%
dplyr::filter(!is.na(.data$chi) & .data$chi != "") %>%
dplyr::left_join(deaths_data,
by = "chi",
suffix = c("", "_boxi")
suffix = c("", "_refined")
) %>%
dplyr::filter(.data$deceased == TRUE) %>%
dplyr::distinct()


# Check and print error message for records which already have a death_date in the episode file, but this doesn't match the BOXI death date
check_death_date_match <- death_joined %>%
dplyr::filter(.data$death_date != .data$death_date_boxi)

if (nrow(check_death_date_match) != 0) {
warning("There were records in the episode file which already have a death_date, but does not match the BOXI NRS death date.")
}


# Check and print error message for records which have a record_keydate1 after their BOXI death date
check_keydate1_death_date <- death_joined %>%
dplyr::filter(.data$record_keydate1 > .data$death_date_boxi)

if (nrow(check_death_date_match) != 0) {
warning("There were records in the episode file which have a record_keydate1 after the BOXI NRS death date.")
}


flag_data <- death_joined %>%
dplyr::mutate(
flag_keydate1 = dplyr::if_else(.data$record_keydate1 > .data$death_date_boxi, 1, 0),
flag_keydate2 = dplyr::if_else(.data$record_keydate2 > .data$death_date_boxi, 1, 0),
flag_keydate1 = dplyr::if_else(.data$record_keydate1 > .data$death_date_refined, 1, 0),
flag_keydate2 = dplyr::if_else(.data$record_keydate2 > .data$death_date_refined, 1, 0),

# Next flag records with 'ongoing' activity after date of death (available from BOXI) if keydate2 is missing and the death date occurs in
# in the current or a previous financial year.
flag_keydate2_missing = dplyr::if_else(((is.na(.data$record_keydate2) | .data$record_keydate2 == "") & (.data$death_date_boxi <= paste0("20", substr(.data$year, 3, 4), "-03-31"))), 1, 0),
flag_keydate2_missing = dplyr::if_else(((is.na(.data$record_keydate2) |
.data$record_keydate2 == "") &
(.data$death_date_refined <= paste0("20", substr(.data$year, 3, 4), "-03-31"))
), 1, 0),

# Also flag records without a death_date in the episode file, but the BOXI death date occurs in the current or a previous financial year.
flag_deathdate_missing = dplyr::if_else(((is.na(.data$death_date) | .data$death_date == "") & (.data$death_date_boxi <= paste0("20", substr(.data$year, 3, 4), "-03-31"))), 1, 0)
flag_deathdate_missing = dplyr::if_else(((is.na(.data$death_date) |
.data$death_date == "") &
(.data$death_date_refined <= paste0("20", substr(.data$year, 3, 4), "-03-31"))
), 1, 0)
) %>%
# These should be flagged by one of the two lines of code above, but in these cases, we will also fill in the blank death date if appropriate

Expand All @@ -67,116 +65,50 @@ add_activity_after_death_flag <- function(
~ any(grepl("^1$", c(...)),
na.rm = TRUE
) * 1
))


# Fill in date of death if missing in the episode file but available in BOXI lookup, due to historic dates of death not being carried
# over from previous financial years
flag_data <- flag_data %>%
)) %>%
# Fill in date of death if missing in the episode file but available in BOXI lookup, due to historic dates of death not being carried
# over from previous financial years
dplyr::filter(.data$activity_after_death == 1) %>%
# Remove temporary flag variables used to create activity after death flag and fill in missing death_date
dplyr::select(.data$year, .data$chi, .data$record_keydate1, .data$record_keydate2, .data$activity_after_death, .data$death_date_boxi) %>%
dplyr::select(
year,
chi,
recid,
record_keydate1,
record_keydate2,
activity_after_death,
death_date_refined,
ep_row_id_death
) %>%
dplyr::distinct()

# Match activity after death flag back to episode file
final_data <- data %>%
dplyr::left_join(
flag_data,
# TODO: this join_by is not 100% accurate. Consider use ep_file_row_id to join
by = c("year", "chi", "record_keydate1", "record_keydate2"),
# this join_by is now 100% accurate.
by = c(
"year",
"chi",
"recid",
"record_keydate1",
"record_keydate2",
"ep_row_id_death"
),
na_matches = "never"
) %>%
dplyr::mutate(death_date = lubridate::as_date(ifelse(is.na(death_date) & !(is.na(death_date_boxi)),
death_date_boxi, death_date
dplyr::mutate(death_date = lubridate::as_date(ifelse(
is.na(death_date) & !(is.na(death_date_refined)),
death_date_refined, death_date
))) %>%
dplyr::select(-death_date_boxi) %>%
dplyr::distinct()
dplyr::select(-death_date_refined, -ep_row_id_death) %>%
dplyr::distinct() %>%
dplyr::mutate(dplyr::if_else(is.na(activity_after_death),
0,
activity_after_death
))

cli::cli_alert_info("Add activity after death flag function finished at {Sys.time()}")

return(final_data)
}


#' Create and read SLF Deaths lookup from processed BOXI NRS deaths extracts
#'
#' @description The BOXI NRS deaths extract lookup should be created after the extract files for all years have been processed,
# but before an episode file has been produced. Therefore, all BOXI NRS years should be run before running episode files.
#'
#' @param ... additional arguments passed to [get_slf_deaths_lookup_path()]
#' @param update the update month (defaults to use [latest_update()])
#'
#' @param write_to_disk (optional) Should the data be written to disk default is
#' `TRUE` i.e. write the data to disk.
#'
#' @return the final data as a [tibble][tibble::tibble-package].
#' @export
#'
#'
#'
# Read data------------------------------------------------

process_combined_deaths_lookup <- function(update = latest_update(),
write_to_disk = TRUE, ...) {
dir_folder <- "/conf/hscdiip/SLF_Extracts/Deaths"
file_names <- list.files(dir_folder,
pattern = "^anon-slf_deaths_lookup_.*parquet",
full.names = TRUE
)

# read all year specific deaths lookups and bind them together
all_boxi_deaths <- lapply(file_names, arrow::read_parquet) %>%
data.table::rbindlist() %>%
# convert to chi for processing
slfhelper::get_chi() %>%
# Remove rows with missing or blank CHI number - could also use na.omit?
# na.omit(all_boxi_deaths)
dplyr::filter(!is.na(.data$chi) | .data$chi != "")

# Check all CHI numbers are valid
chi_check <- all_boxi_deaths %>%
dplyr::pull(.data$chi) %>%
phsmethods::chi_check()

if (!all(chi_check %in% c("Valid CHI", "Missing (Blank)", "Missing (NA)"))) {
# There are some Missing (NA) values in the extracts, but I have excluded them above as they cannot be matched to episode file
stop("There were bad CHI numbers in the BOXI NRS file")
}

# Check and print error message for chi numbers with more than one death date
duplicates <- all_boxi_deaths %>%
janitor::get_dupes(.data$chi)

if (nrow(duplicates) != 0) {
# There are some Missing (NA) values in the extracts, but I have excluded them above as they cannot be matched to episode file
warning("There were duplicate death dates in the BOXI NRS file.")
}


# We decided to include duplicates as unable to determine which is correct date (unless IT can tell us, however, they don't seem to know
# the process well enough), and overall impact will be negligible
# Get anon_chi and use this to match onto episode file later
all_boxi_deaths <- all_boxi_deaths %>%
slfhelper::get_anon_chi()

# Save out duplicates for further investigation if needed (as anon_chi)
if (!missing(duplicates)) {
write_file(
duplicates,
fs::path(get_slf_dir(), "Deaths",
file_name = stringr::str_glue("slf_deaths_duplicates_{update}.parquet")
)
)
}

# Maybe save as its own function
# Write the all BOXI NRS deaths lookup file to disk, so this can be used to populate activity after death flag in each episode file
if (write_to_disk) {
write_file(
all_boxi_deaths,
get_combined_slf_deaths_lookup_path()
)
}

return(all_boxi_deaths)
}
9 changes: 7 additions & 2 deletions R/process_refined_death.R
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,13 @@ process_refined_death <- function(
dplyr::mutate(
fy = phsmethods::extract_fin_year(death_date),
fy = as.character(paste0(substr(fy, 3, 4), substr(fy, 6, 7)))
)
# TODO: check distinct death data by chi while keeping chi==NA records
) %>%
# no need to keep NA
dplyr::filter(!is.na(anon_chi)) %>%
dplyr::group_by(anon_chi) %>%
dplyr::arrange(death_date) %>%
dplyr::distinct(anon_chi, .keep_all = TRUE) %>%
dplyr::ungroup()

if (write_to_disk) {
write_file(
Expand Down
26 changes: 0 additions & 26 deletions man/process_combined_deaths_lookup.Rd

This file was deleted.