Public-Health-Scotland · Jennit07 · Sep 26, 2023 · Jul 18, 2023 · Jul 18, 2023 · Jul 19, 2023
diff --git a/R/add_nsu_cohort.R b/R/add_nsu_cohort.R
@@ -2,13 +2,17 @@
 #'
 #' @param data The input data frame
 #' @param year The year being processed
+#' @param nsu_cohort The NSU data for the year
 #'
 #' @return A data frame containing the Non-Service Users as additional rows
 #' @export
 #'
 #' @family episode file
 #' @seealso [get_nsu_path()]
-add_nsu_cohort <- function(data, year) {
+add_nsu_cohort <- function(
+    data,
+    year,
+    nsu_cohort = read_file(get_nsu_path(year))) {
   year_param <- year
 
   if (!check_year_valid(year, "NSU")) {
@@ -29,9 +33,9 @@ add_nsu_cohort <- function(data, year) {
     )
   )
 
-  matched <- dplyr::full_join(data,
-    # NSU cohort file
-    read_file(get_nsu_path(year)) %>%
+  matched <- dplyr::full_join(
+    data,
+    nsu_cohort %>%
       dplyr::mutate(
         dob = as.Date(.data[["dob"]]),
         gpprac = convert_eng_gpprac_to_dummy(.data[["gpprac"]])
@@ -110,7 +114,6 @@ add_nsu_cohort <- function(data, year) {
         .data[["chi"]]
       )
     ) %>%
-    # Remove the additional columns
     dplyr::select(-dplyr::contains("_nsu"), -"has_chi")
 
   return(return_df)

diff --git a/R/create_episode_file.R b/R/create_episode_file.R
@@ -1,17 +1,32 @@
-#' Create the Source Episode file
+#' Produce the Source Episode file
 #'
 #' @param processed_data_list containing data from processed extracts.
 #' @param year The year to process, in FY format.
 #' @param write_to_disk (optional) Should the data be written to disk default is
 #' `TRUE` i.e. write the data to disk.
+#' @inheritParams add_nsu_cohort
+#' @inheritParams fill_geographies
+#' @inheritParams join_cohort_lookups
+#' @inheritParams join_deaths_data
+#' @inheritParams match_on_ltcs
+#' @inheritParams link_delayed_discharge_eps
 #' @param anon_chi_out (Default:TRUE) Should `anon_chi` be used in the output
-#' (instead of chi).
+#' (instead of chi)
 #'
-#' @return the Source Episode file as a [tibble][tibble::tibble-package].
+#' @return a [tibble][tibble::tibble-package] containing the episode file
 #' @export
 create_episode_file <- function(
     processed_data_list,
     year,
+    dd_data = read_file(get_source_extract_path(year, "DD")),
+    nsu_cohort = read_file(get_nsu_path(year)),
+    ltc_data = read_file(get_ltcs_path(year)),
+    slf_pc_lookup = read_file(get_slf_postcode_path()),
+    slf_gpprac_lookup = read_file(
+      get_slf_gpprac_path(),
+      col_select = c("gpprac", "cluster", "hbpraccode")
+    ),
+    slf_deaths_lookup = read_file(get_slf_deaths_lookup_path(year)),
     write_to_disk = TRUE,
     anon_chi_out = TRUE) {
   episode_file <- dplyr::bind_rows(processed_data_list) %>%
@@ -99,15 +114,21 @@ create_episode_file <- function(
     correct_cij_vars() %>%
     fill_missing_cij_markers() %>%
     add_ppa_flag() %>%
-    link_delayed_discharge_eps(year) %>%
-    add_nsu_cohort(year) %>%
-    match_on_ltcs(year) %>%
+    link_delayed_discharge_eps(year, dd_data) %>%
+    add_nsu_cohort(year, nsu_cohort) %>%
+    match_on_ltcs(year, ltc_data) %>%
     correct_demographics(year) %>%
     create_cohort_lookups(year) %>%
     join_cohort_lookups(year) %>%
     join_sparra_hhg(year) %>%
-    fill_geographies() %>%
-    join_deaths_data(year) %>%
+    fill_geographies(
+      slf_pc_lookup,
+      slf_gpprac_lookup
+    ) %>%
+    join_deaths_data(
+      year,
+      slf_deaths_lookup
+    ) %>%
     load_ep_file_vars(year)
 
   if (anon_chi_out) {
@@ -354,22 +375,28 @@ create_cohort_lookups <- function(data, year, update = latest_update()) {
 #'
 #' @inheritParams store_ep_file_vars
 #' @inheritParams get_demographic_cohorts_path
+#' @param demographic_cohort,service_use_cohort The cohort data
 #'
 #' @return The data including the Demographic and Service Use lookups.
-join_cohort_lookups <- function(data, year, update = latest_update()) {
+join_cohort_lookups <- function(
+    data,
+    year,
+    update = latest_update(),
+    demographic_cohort = read_file(
+      get_demographic_cohorts_path(year, update),
+      col_select = c("chi", "demographic_cohort")
+    ),
+    service_use_cohort = read_file(
+      get_service_use_cohorts_path(year, update),
+      col_select = c("chi", "service_use_cohort")
+    )) {
   join_cohort_lookups <- data %>%
     dplyr::left_join(
-      read_file(
-        get_demographic_cohorts_path(year, update),
-        col_select = c("chi", "demographic_cohort")
-      ),
+      demographic_cohort,
       by = "chi"
     ) %>%
     dplyr::left_join(
-      read_file(
-        get_service_use_cohorts_path(year, update),
-        col_select = c("chi", "service_use_cohort")
-      ),
+      service_use_cohort,
       by = "chi"
     )
 

diff --git a/R/fill_geographies.R b/R/fill_geographies.R
@@ -4,10 +4,18 @@
 #' then use the lookups to match on additional variables.
 #'
 #' @param data the SLF
+#' @param slf_pc_lookup The SLF Postcode lookup
+#' @param slf_gpprac_lookup The SLF GP Practice lookup
 #'
 #' @return a [tibble][tibble::tibble-package] of the SLF with improved
 #' Postcode and GP Practice details.
-fill_geographies <- function(data) {
+fill_geographies <- function(
+    data,
+    slf_pc_lookup = read_file(get_slf_postcode_path()),
+    slf_gpprac_lookup = read_file(
+      get_slf_gpprac_path(),
+      col_select = c("gpprac", "cluster", "hbpraccode")
+    )) {
   check_variables_exist(data, c(
     "chi",
     "postcode",
@@ -21,8 +29,15 @@ fill_geographies <- function(data) {
   ))
 
   data %>%
-    fill_postcode_geogs() %>%
-    fill_gpprac_geographies()
+    fill_postcode_geogs(
+      slf_pc_lookup = read_file(get_slf_postcode_path())
+    ) %>%
+    fill_gpprac_geographies(
+      slf_gpprac_lookup = read_file(
+        get_slf_gpprac_path(),
+        col_select = c("gpprac", "cluster", "hbpraccode")
+      )
+    )
 }
 
 #' Make a postcode lookup for filling to most recent postcodes based on CHI
@@ -86,9 +101,9 @@ make_gpprac_lookup <- function(data) {
   return(gpprac_lookup)
 }
 
-fill_postcode_geogs <- function(data) {
-  slf_pc_lookup <- read_file(get_slf_postcode_path())
-
+fill_postcode_geogs <- function(
+    data,
+    slf_pc_lookup) {
   filled_postcodes <- dplyr::left_join(
     data,
     make_postcode_lookup(data),
@@ -123,17 +138,20 @@ fill_postcode_geogs <- function(data) {
       lca = dplyr::coalesce(.data$lca, .data$lca_old),
       datazone2011 = dplyr::coalesce(.data$datazone2011, .data$datazone2011_old)
     ) %>%
-    dplyr::select(!c("hb2018", "hscp", "lca_old", "datazone2011_old", "most_recent_postcode"))
+    dplyr::select(!c(
+      "hb2018",
+      "hscp",
+      "lca_old",
+      "datazone2011_old",
+      "most_recent_postcode"
+    ))
 
   return(filled_postcodes)
 }
 
-fill_gpprac_geographies <- function(data) {
-  gpprac_ref <- read_file(
-    get_slf_gpprac_path(),
-    col_select = c("gpprac", "cluster", "hbpraccode")
-  )
-
+fill_gpprac_geographies <- function(
+    data,
+    slf_gpprac_lookup) {
   filled_gpprac <- dplyr::left_join(
     data,
     make_gpprac_lookup(data),
@@ -147,7 +165,12 @@ fill_gpprac_geographies <- function(data) {
         .data$gpprac
       )
     ) %>%
-    dplyr::left_join(gpprac_ref, by = "gpprac", suffix = c("_old", "")) %>%
+    dplyr::left_join(
+      slf_gpprac_lookup %>%
+        dplyr::select("gpprac", "cluster", "hbpraccode"),
+      by = "gpprac",
+      suffix = c("_old", "")
+    ) %>%
     dplyr::mutate(
       hbpraccode = dplyr::coalesce(.data$hbpraccode, .data$hbpraccode_old)
     ) %>%

diff --git a/R/get_source_extract_path.R b/R/get_source_extract_path.R
@@ -41,34 +41,34 @@ get_source_extract_path <- function(
   type <- match.arg(type)
 
   if (!check_year_valid(year, type)) {
-    return(NA)
+    return(get_dummy_boxi_extract_path())
   }
 
   file_name <- dplyr::case_match(
     type,
     "Acute" ~ "acute_for_source",
-    "AE" ~ "a&e_for_source",
-    "AT" ~ "Alarms-Telecare-for-source",
+    "AE" ~ "a_and_e_for_source",
+    "AT" ~ "alarms-telecare-for-source",
     "CH" ~ "care_home_for_source",
-    "CMH" ~ "CMH_for_source",
+    "CMH" ~ "cmh_for_source",
     "Client" ~ "client_for_source",
-    "DD" ~ "DD_for_source",
+    "DD" ~ "delayed_discharge_for_source",
     "Deaths" ~ "deaths_for_source",
-    "DN" ~ "DN_for_source",
-    "GPOoH" ~ "GP_OOH_for_source",
-    "HC" ~ "Home_Care_for_source",
+    "DN" ~ "district_nursing_for_source",
+    "GPOoH" ~ "gp_ooh_for_source",
+    "HC" ~ "home_care_for_source",
     "Homelessness" ~ "homelessness_for_source",
     "Maternity" ~ "maternity_for_source",
     "MH" ~ "mental_health_for_source",
-    "DD" ~ "DD_for_source",
     "Outpatients" ~ "outpatients_for_source",
-    "PIS" ~ "prescribing_file_for_source",
-    "SDS" ~ "SDS-for-source"
-  )
+    "PIS" ~ "prescribing_for_source",
+    "SDS" ~ "sds_for_source"
+  ) %>%
+    stringr::str_glue("-{year}.parquet")
 
   source_extract_path <- get_file_path(
     directory = get_year_dir(year),
-    file_name = stringr::str_glue("{file_name}-20{year}.parquet"),
+    file_name = file_name,
     ...
   )
 

diff --git a/R/get_sparra_hhg_paths.R b/R/get_sparra_hhg_paths.R
@@ -10,6 +10,10 @@
 #' @family extract file paths
 #' @seealso [get_file_path()] for the generic function.
 get_hhg_path <- function(year, ...) {
+  if (!check_year_valid(year, "HHG")) {
+    return(get_dummy_boxi_extract_path())
+  }
+
   hhg_file_path <- get_file_path(
     directory = fs::path(get_slf_dir(), "HHG"),
     file_name = stringr::str_glue("HHG-20{year}.parquet"),
@@ -31,6 +35,10 @@ get_hhg_path <- function(year, ...) {
 #' @family extract file paths
 #' @seealso [get_file_path()] for the generic function.
 get_sparra_path <- function(year, ...) {
+  if (!check_year_valid(year, "SPARRA")) {
+    return(get_dummy_boxi_extract_path())
+  }
+
   sparra_file_path <- get_file_path(
     directory = fs::path(get_slf_dir(), "SPARRA"),
     file_name = stringr::str_glue("SPARRA-20{year}.parquet"),

diff --git a/R/join_deaths_data.R b/R/join_deaths_data.R
@@ -2,16 +2,14 @@
 #'
 #' @param data Episode file data
 #' @param year financial year, e.g. '1920'
-#' @param slf_deaths_lookup_path Path to slf deaths lookup.
+#' @param slf_deaths_lookup The SLF deaths lookup.
 #'
 #' @return The data including the deaths lookup matched
 #'         on to the episode file.
 join_deaths_data <- function(
     data,
     year,
-    slf_deaths_lookup_path = get_slf_deaths_lookup_path(year)) {
-  slf_deaths_lookup <- read_file(slf_deaths_lookup_path)
-
+    slf_deaths_lookup = read_file(get_slf_deaths_lookup_path(year))) {
   return(
     data %>%
       dplyr::left_join(

diff --git a/R/match_on_ltcs.R b/R/match_on_ltcs.R
@@ -5,13 +5,17 @@
 #'
 #' @param data episode files
 #' @param year financial year, e.g. '1920'
+#' @param ltc_data The LTC data for the year
 #'
 #' @return data matched with long term conditions
-match_on_ltcs <- function(data, year) {
+match_on_ltcs <- function(
+    data,
+    year,
+    ltc_data = read_file(get_ltcs_path(year))) {
   # Match on LTC lookup
   matched <- dplyr::left_join(
     data,
-    read_file(get_ltcs_path(year)),
+    ltc_data,
     by = "chi",
     suffix = c("", "_ltc")
   ) %>%

diff --git a/R/read_file.R b/R/read_file.R
@@ -27,6 +27,11 @@ read_file <- function(path, col_select = NULL, as_data_frame = TRUE, ...) {
     "parquet"
   )
 
+  # Return an empty tibble if trying to read the dummy path
+  if (path == get_dummy_boxi_extract_path()) {
+    return(tibble::tibble())
+  }
+
   ext <- fs::path_ext(path)
 
   if (ext == "gz") {

diff --git a/_targets.R b/_targets.R
@@ -543,11 +543,18 @@ list(
         source_sc_alarms_tele
       )
     ),
+    tar_file_read(nsu_cohort, get_nsu_path(year), read_file(!!.x)),
     tar_target(
       episode_file,
       create_episode_file(
         processed_data_list,
         year,
+        dd_data = source_dd_extract,
+        nsu_cohort = nsu_cohort,
+        ltc_data = source_ltc_lookup,
+        slf_pc_lookup = source_pc_lookup,
+        slf_gpprac_lookup = source_gp_lookup,
+        slf_deaths_lookup = slf_deaths_lookup,
         write_to_disk
       )
     ),

diff --git a/man/add_nsu_cohort.Rd b/man/add_nsu_cohort.Rd