Skip to content

Commit

Permalink
death join and distinct refined death (#1015)
Browse files Browse the repository at this point in the history
* distinct death date, keep the earliest one and remove na

* add activity after death 100% accurate joining

* Style code

* remove redundant combine death function

* Update documentation

* fix NA in activity_after_death

* Style code

---------

Co-authored-by: lizihao-anu <[email protected]>
Co-authored-by: Jennit07 <[email protected]>
  • Loading branch information
3 people authored Oct 22, 2024
1 parent 7787668 commit ffa3d0c
Show file tree
Hide file tree
Showing 4 changed files with 64 additions and 154 deletions.
1 change: 0 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,6 @@ export(midpoint_fy)
export(next_fy)
export(phs_db_connection)
export(previous_update)
export(process_combined_deaths_lookup)
export(process_costs_ch_rmd)
export(process_costs_dn_rmd)
export(process_costs_gp_ooh_rmd)
Expand Down
182 changes: 57 additions & 125 deletions R/add_activity_after_death_flag.R
Original file line number Diff line number Diff line change
Expand Up @@ -16,47 +16,45 @@ add_activity_after_death_flag <- function(
# to skip warnings no visible binding for global variable ‘.’
. <- NULL

data <- data %>%
dplyr::mutate(ep_row_id_death = dplyr::row_number())

death_joined <- data %>%
dplyr::select(.data$year, .data$chi, .data$record_keydate1, .data$record_keydate2, .data$death_date, .data$deceased) %>%
dplyr::filter(!is.na(.data$chi) | .data$chi != "") %>%
dplyr::left_join(
deaths_data,
dplyr::select(
"year",
"chi",
"recid",
"record_keydate1",
"record_keydate2",
"death_date",
"deceased",
"ep_row_id_death"
) %>%
dplyr::filter(!is.na(.data$chi) & .data$chi != "") %>%
dplyr::left_join(deaths_data,
by = "chi",
suffix = c("", "_boxi")
suffix = c("", "_refined")
) %>%
dplyr::filter(.data$deceased == TRUE) %>%
dplyr::distinct()


# Check and print error message for records which already have a death_date in the episode file, but this doesn't match the BOXI death date
check_death_date_match <- death_joined %>%
dplyr::filter(.data$death_date != .data$death_date_boxi)

if (nrow(check_death_date_match) != 0) {
warning("There were records in the episode file which already have a death_date, but does not match the BOXI NRS death date.")
}


# Check and print error message for records which have a record_keydate1 after their BOXI death date
check_keydate1_death_date <- death_joined %>%
dplyr::filter(.data$record_keydate1 > .data$death_date_boxi)

if (nrow(check_death_date_match) != 0) {
warning("There were records in the episode file which have a record_keydate1 after the BOXI NRS death date.")
}


flag_data <- death_joined %>%
dplyr::mutate(
flag_keydate1 = dplyr::if_else(.data$record_keydate1 > .data$death_date_boxi, 1, 0),
flag_keydate2 = dplyr::if_else(.data$record_keydate2 > .data$death_date_boxi, 1, 0),
flag_keydate1 = dplyr::if_else(.data$record_keydate1 > .data$death_date_refined, 1, 0),
flag_keydate2 = dplyr::if_else(.data$record_keydate2 > .data$death_date_refined, 1, 0),

# Next flag records with 'ongoing' activity after date of death (available from BOXI) if keydate2 is missing and the death date occurs in
# in the current or a previous financial year.
flag_keydate2_missing = dplyr::if_else(((is.na(.data$record_keydate2) | .data$record_keydate2 == "") & (.data$death_date_boxi <= paste0("20", substr(.data$year, 3, 4), "-03-31"))), 1, 0),
flag_keydate2_missing = dplyr::if_else(((is.na(.data$record_keydate2) |
.data$record_keydate2 == "") &
(.data$death_date_refined <= paste0("20", substr(.data$year, 3, 4), "-03-31"))
), 1, 0),

# Also flag records without a death_date in the episode file, but the BOXI death date occurs in the current or a previous financial year.
flag_deathdate_missing = dplyr::if_else(((is.na(.data$death_date) | .data$death_date == "") & (.data$death_date_boxi <= paste0("20", substr(.data$year, 3, 4), "-03-31"))), 1, 0)
flag_deathdate_missing = dplyr::if_else(((is.na(.data$death_date) |
.data$death_date == "") &
(.data$death_date_refined <= paste0("20", substr(.data$year, 3, 4), "-03-31"))
), 1, 0)
) %>%
# These should be flagged by one of the two lines of code above, but in these cases, we will also fill in the blank death date if appropriate

Expand All @@ -67,116 +65,50 @@ add_activity_after_death_flag <- function(
~ any(grepl("^1$", c(...)),
na.rm = TRUE
) * 1
))


# Fill in date of death if missing in the episode file but available in BOXI lookup, due to historic dates of death not being carried
# over from previous financial years
flag_data <- flag_data %>%
)) %>%
# Fill in date of death if missing in the episode file but available in BOXI lookup, due to historic dates of death not being carried
# over from previous financial years
dplyr::filter(.data$activity_after_death == 1) %>%
# Remove temporary flag variables used to create activity after death flag and fill in missing death_date
dplyr::select(.data$year, .data$chi, .data$record_keydate1, .data$record_keydate2, .data$activity_after_death, .data$death_date_boxi) %>%
dplyr::select(
year,
chi,
recid,
record_keydate1,
record_keydate2,
activity_after_death,
death_date_refined,
ep_row_id_death
) %>%
dplyr::distinct()

# Match activity after death flag back to episode file
final_data <- data %>%
dplyr::left_join(
flag_data,
# TODO: this join_by is not 100% accurate. Consider use ep_file_row_id to join
by = c("year", "chi", "record_keydate1", "record_keydate2"),
# this join_by is now 100% accurate.
by = c(
"year",
"chi",
"recid",
"record_keydate1",
"record_keydate2",
"ep_row_id_death"
),
na_matches = "never"
) %>%
dplyr::mutate(death_date = lubridate::as_date(ifelse(is.na(death_date) & !(is.na(death_date_boxi)),
death_date_boxi, death_date
dplyr::mutate(death_date = lubridate::as_date(ifelse(
is.na(death_date) & !(is.na(death_date_refined)),
death_date_refined, death_date
))) %>%
dplyr::select(-death_date_boxi) %>%
dplyr::distinct()
dplyr::select(-death_date_refined, -ep_row_id_death) %>%
dplyr::distinct() %>%
dplyr::mutate(dplyr::if_else(is.na(activity_after_death),
0,
activity_after_death
))

cli::cli_alert_info("Add activity after death flag function finished at {Sys.time()}")

return(final_data)
}


#' Create and read SLF Deaths lookup from processed BOXI NRS deaths extracts
#'
#' @description The BOXI NRS deaths extract lookup should be created after the extract files for all years have been processed,
# but before an episode file has been produced. Therefore, all BOXI NRS years should be run before running episode files.
#'
#' @param ... additional arguments passed to [get_slf_deaths_lookup_path()]
#' @param update the update month (defaults to use [latest_update()])
#'
#' @param write_to_disk (optional) Should the data be written to disk default is
#' `TRUE` i.e. write the data to disk.
#'
#' @return the final data as a [tibble][tibble::tibble-package].
#' @export
#'
#'
#'
# Read data------------------------------------------------

process_combined_deaths_lookup <- function(update = latest_update(),
write_to_disk = TRUE, ...) {
dir_folder <- "/conf/hscdiip/SLF_Extracts/Deaths"
file_names <- list.files(dir_folder,
pattern = "^anon-slf_deaths_lookup_.*parquet",
full.names = TRUE
)

# read all year specific deaths lookups and bind them together
all_boxi_deaths <- lapply(file_names, arrow::read_parquet) %>%
data.table::rbindlist() %>%
# convert to chi for processing
slfhelper::get_chi() %>%
# Remove rows with missing or blank CHI number - could also use na.omit?
# na.omit(all_boxi_deaths)
dplyr::filter(!is.na(.data$chi) | .data$chi != "")

# Check all CHI numbers are valid
chi_check <- all_boxi_deaths %>%
dplyr::pull(.data$chi) %>%
phsmethods::chi_check()

if (!all(chi_check %in% c("Valid CHI", "Missing (Blank)", "Missing (NA)"))) {
# There are some Missing (NA) values in the extracts, but I have excluded them above as they cannot be matched to episode file
stop("There were bad CHI numbers in the BOXI NRS file")
}

# Check and print error message for chi numbers with more than one death date
duplicates <- all_boxi_deaths %>%
janitor::get_dupes(.data$chi)

if (nrow(duplicates) != 0) {
# There are some Missing (NA) values in the extracts, but I have excluded them above as they cannot be matched to episode file
warning("There were duplicate death dates in the BOXI NRS file.")
}


# We decided to include duplicates as unable to determine which is correct date (unless IT can tell us, however, they don't seem to know
# the process well enough), and overall impact will be negligible
# Get anon_chi and use this to match onto episode file later
all_boxi_deaths <- all_boxi_deaths %>%
slfhelper::get_anon_chi()

# Save out duplicates for further investigation if needed (as anon_chi)
if (!missing(duplicates)) {
write_file(
duplicates,
fs::path(get_slf_dir(), "Deaths",
file_name = stringr::str_glue("slf_deaths_duplicates_{update}.parquet")
)
)
}

# Maybe save as its own function
# Write the all BOXI NRS deaths lookup file to disk, so this can be used to populate activity after death flag in each episode file
if (write_to_disk) {
write_file(
all_boxi_deaths,
get_combined_slf_deaths_lookup_path()
)
}

return(all_boxi_deaths)
}
9 changes: 7 additions & 2 deletions R/process_refined_death.R
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,13 @@ process_refined_death <- function(
dplyr::mutate(
fy = phsmethods::extract_fin_year(death_date),
fy = as.character(paste0(substr(fy, 3, 4), substr(fy, 6, 7)))
)
# TODO: check distinct death data by chi while keeping chi==NA records
) %>%
# no need to keep NA
dplyr::filter(!is.na(anon_chi)) %>%
dplyr::group_by(anon_chi) %>%
dplyr::arrange(death_date) %>%
dplyr::distinct(anon_chi, .keep_all = TRUE) %>%
dplyr::ungroup()

if (write_to_disk) {
write_file(
Expand Down
26 changes: 0 additions & 26 deletions man/process_combined_deaths_lookup.Rd

This file was deleted.

0 comments on commit ffa3d0c

Please sign in to comment.