death join and distinct refined death (#1015)

* distinct death date, keep the earliest one and remove na
* add activity after death 100% accurate joining
* Style code
* remove redundant combine death function
* Update documentation
* fix NA in activity_after_death
* Style code

---------

Co-authored-by: lizihao-anu <[email protected]>
Co-authored-by: Jennit07 <[email protected]>
Public-Health-Scotland · Oct 22, 2024 · ffa3d0c · ffa3d0c
1 parent 7787668
commit ffa3d0c
Show file tree

Hide file tree

Showing 4 changed files with 64 additions and 154 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -90,7 +90,6 @@ export(midpoint_fy)
 export(next_fy)
 export(phs_db_connection)
 export(previous_update)
-export(process_combined_deaths_lookup)
 export(process_costs_ch_rmd)
 export(process_costs_dn_rmd)
 export(process_costs_gp_ooh_rmd)

diff --git a/R/add_activity_after_death_flag.R b/R/add_activity_after_death_flag.R
@@ -16,47 +16,45 @@ add_activity_after_death_flag <- function(
   # to skip warnings no visible binding for global variable ‘.’
   . <- NULL
 
+  data <- data %>%
+    dplyr::mutate(ep_row_id_death = dplyr::row_number())
+
   death_joined <- data %>%
-    dplyr::select(.data$year, .data$chi, .data$record_keydate1, .data$record_keydate2, .data$death_date, .data$deceased) %>%
-    dplyr::filter(!is.na(.data$chi) | .data$chi != "") %>%
-    dplyr::left_join(
-      deaths_data,
+    dplyr::select(
+      "year",
+      "chi",
+      "recid",
+      "record_keydate1",
+      "record_keydate2",
+      "death_date",
+      "deceased",
+      "ep_row_id_death"
+    ) %>%
+    dplyr::filter(!is.na(.data$chi) & .data$chi != "") %>%
+    dplyr::left_join(deaths_data,
       by = "chi",
-      suffix = c("", "_boxi")
+      suffix = c("", "_refined")
     ) %>%
     dplyr::filter(.data$deceased == TRUE) %>%
     dplyr::distinct()
 
-
-  # Check and print error message for records which already have a death_date in the episode file, but this doesn't match the BOXI death date
-  check_death_date_match <- death_joined %>%
-    dplyr::filter(.data$death_date != .data$death_date_boxi)
-
-  if (nrow(check_death_date_match) != 0) {
-    warning("There were records in the episode file which already have a death_date, but does not match the BOXI NRS death date.")
-  }
-
-
-  # Check and print error message for records which have a record_keydate1 after their BOXI death date
-  check_keydate1_death_date <- death_joined %>%
-    dplyr::filter(.data$record_keydate1 > .data$death_date_boxi)
-
-  if (nrow(check_death_date_match) != 0) {
-    warning("There were records in the episode file which have a record_keydate1 after the BOXI NRS death date.")
-  }
-
-
   flag_data <- death_joined %>%
     dplyr::mutate(
-      flag_keydate1 = dplyr::if_else(.data$record_keydate1 > .data$death_date_boxi, 1, 0),
-      flag_keydate2 = dplyr::if_else(.data$record_keydate2 > .data$death_date_boxi, 1, 0),
+      flag_keydate1 = dplyr::if_else(.data$record_keydate1 > .data$death_date_refined, 1, 0),
+      flag_keydate2 = dplyr::if_else(.data$record_keydate2 > .data$death_date_refined, 1, 0),
 
       # Next flag records with 'ongoing' activity after date of death (available from BOXI) if keydate2 is missing and the death date occurs in
       # in the current or a previous financial year.
-      flag_keydate2_missing = dplyr::if_else(((is.na(.data$record_keydate2) | .data$record_keydate2 == "") & (.data$death_date_boxi <= paste0("20", substr(.data$year, 3, 4), "-03-31"))), 1, 0),
+      flag_keydate2_missing = dplyr::if_else(((is.na(.data$record_keydate2) |
+        .data$record_keydate2 == "") &
+        (.data$death_date_refined <= paste0("20", substr(.data$year, 3, 4), "-03-31"))
+      ), 1, 0),
 
       # Also flag records without a death_date in the episode file, but the BOXI death date occurs in the current or a previous financial year.
-      flag_deathdate_missing = dplyr::if_else(((is.na(.data$death_date) | .data$death_date == "") & (.data$death_date_boxi <= paste0("20", substr(.data$year, 3, 4), "-03-31"))), 1, 0)
+      flag_deathdate_missing = dplyr::if_else(((is.na(.data$death_date) |
+        .data$death_date == "") &
+        (.data$death_date_refined <= paste0("20", substr(.data$year, 3, 4), "-03-31"))
+      ), 1, 0)
     ) %>%
     # These should be flagged by one of the two lines of code above, but in these cases, we will also fill in the blank death date if appropriate
 
@@ -67,116 +65,50 @@ add_activity_after_death_flag <- function(
       ~ any(grepl("^1$", c(...)),
         na.rm = TRUE
       ) * 1
-    ))
-
-
-  # Fill in date of death if missing in the episode file but available in BOXI lookup, due to historic dates of death not being carried
-  # over from previous financial years
-  flag_data <- flag_data %>%
+    )) %>%
+    # Fill in date of death if missing in the episode file but available in BOXI lookup, due to historic dates of death not being carried
+    # over from previous financial years
     dplyr::filter(.data$activity_after_death == 1) %>%
     # Remove temporary flag variables used to create activity after death flag and fill in missing death_date
-    dplyr::select(.data$year, .data$chi, .data$record_keydate1, .data$record_keydate2, .data$activity_after_death, .data$death_date_boxi) %>%
+    dplyr::select(
+      year,
+      chi,
+      recid,
+      record_keydate1,
+      record_keydate2,
+      activity_after_death,
+      death_date_refined,
+      ep_row_id_death
+    ) %>%
     dplyr::distinct()
 
   # Match activity after death flag back to episode file
   final_data <- data %>%
     dplyr::left_join(
       flag_data,
-      # TODO: this join_by is not 100% accurate. Consider use ep_file_row_id to join
-      by = c("year", "chi", "record_keydate1", "record_keydate2"),
+      # this join_by is now 100% accurate.
+      by = c(
+        "year",
+        "chi",
+        "recid",
+        "record_keydate1",
+        "record_keydate2",
+        "ep_row_id_death"
+      ),
       na_matches = "never"
     ) %>%
-    dplyr::mutate(death_date = lubridate::as_date(ifelse(is.na(death_date) & !(is.na(death_date_boxi)),
-      death_date_boxi, death_date
+    dplyr::mutate(death_date = lubridate::as_date(ifelse(
+      is.na(death_date) & !(is.na(death_date_refined)),
+      death_date_refined, death_date
     ))) %>%
-    dplyr::select(-death_date_boxi) %>%
-    dplyr::distinct()
+    dplyr::select(-death_date_refined, -ep_row_id_death) %>%
+    dplyr::distinct() %>%
+    dplyr::mutate(dplyr::if_else(is.na(activity_after_death),
+      0,
+      activity_after_death
+    ))
 
   cli::cli_alert_info("Add activity after death flag function finished at {Sys.time()}")
 
   return(final_data)
 }
-
-
-#' Create and read SLF Deaths lookup from processed BOXI NRS deaths extracts
-#'
-#' @description The BOXI NRS deaths extract lookup should be created after the extract files for all years have been processed,
-# but before an episode file has been produced. Therefore, all BOXI NRS years should be run before running episode files.
-#'
-#' @param ... additional arguments passed to [get_slf_deaths_lookup_path()]
-#' @param update the update month (defaults to use [latest_update()])
-#'
-#' @param write_to_disk (optional) Should the data be written to disk default is
-#' `TRUE` i.e. write the data to disk.
-#'
-#' @return the final data as a [tibble][tibble::tibble-package].
-#' @export
-#'
-#'
-#'
-# Read data------------------------------------------------
-
-process_combined_deaths_lookup <- function(update = latest_update(),
-                                           write_to_disk = TRUE, ...) {
-  dir_folder <- "/conf/hscdiip/SLF_Extracts/Deaths"
-  file_names <- list.files(dir_folder,
-    pattern = "^anon-slf_deaths_lookup_.*parquet",
-    full.names = TRUE
-  )
-
-  # read all year specific deaths lookups and bind them together
-  all_boxi_deaths <- lapply(file_names, arrow::read_parquet) %>%
-    data.table::rbindlist() %>%
-    # convert to chi for processing
-    slfhelper::get_chi() %>%
-    # Remove rows with missing or blank CHI number - could also use na.omit?
-    # na.omit(all_boxi_deaths)
-    dplyr::filter(!is.na(.data$chi) | .data$chi != "")
-
-  # Check all CHI numbers are valid
-  chi_check <- all_boxi_deaths %>%
-    dplyr::pull(.data$chi) %>%
-    phsmethods::chi_check()
-
-  if (!all(chi_check %in% c("Valid CHI", "Missing (Blank)", "Missing (NA)"))) {
-    # There are some Missing (NA) values in the extracts, but I have excluded them above as they cannot be matched to episode file
-    stop("There were bad CHI numbers in the BOXI NRS file")
-  }
-
-  # Check and print error message for chi numbers with more than one death date
-  duplicates <- all_boxi_deaths %>%
-    janitor::get_dupes(.data$chi)
-
-  if (nrow(duplicates) != 0) {
-    # There are some Missing (NA) values in the extracts, but I have excluded them above as they cannot be matched to episode file
-    warning("There were duplicate death dates in the BOXI NRS file.")
-  }
-
-
-  # We decided to include duplicates as unable to determine which is correct date (unless IT can tell us, however, they don't seem to know
-  # the process well enough), and overall impact will be negligible
-  # Get anon_chi and use this to match onto episode file later
-  all_boxi_deaths <- all_boxi_deaths %>%
-    slfhelper::get_anon_chi()
-
-  # Save out duplicates for further investigation if needed (as anon_chi)
-  if (!missing(duplicates)) {
-    write_file(
-      duplicates,
-      fs::path(get_slf_dir(), "Deaths",
-        file_name = stringr::str_glue("slf_deaths_duplicates_{update}.parquet")
-      )
-    )
-  }
-
-  # Maybe save as its own function
-  # Write the all BOXI NRS deaths lookup file to disk, so this can be used to populate activity after death flag in each episode file
-  if (write_to_disk) {
-    write_file(
-      all_boxi_deaths,
-      get_combined_slf_deaths_lookup_path()
-    )
-  }
-
-  return(all_boxi_deaths)
-}
diff --git a/R/process_refined_death.R b/R/process_refined_death.R
@@ -49,8 +49,13 @@ process_refined_death <- function(
     dplyr::mutate(
       fy = phsmethods::extract_fin_year(death_date),
       fy = as.character(paste0(substr(fy, 3, 4), substr(fy, 6, 7)))
-    )
-  # TODO: check distinct death data by chi while keeping chi==NA records
+    ) %>%
+    # no need to keep NA
+    dplyr::filter(!is.na(anon_chi)) %>%
+    dplyr::group_by(anon_chi) %>%
+    dplyr::arrange(death_date) %>%
+    dplyr::distinct(anon_chi, .keep_all = TRUE) %>%
+    dplyr::ungroup()
 
   if (write_to_disk) {
     write_file(

diff --git a/man/process_combined_deaths_lookup.Rd b/man/process_combined_deaths_lookup.Rd