From e6d6ca6761a0fc9b79c9c2cb34076955f5dca139 Mon Sep 17 00:00:00 2001
From: Mandy Norrbo <mandy@jumpingrivers.com>
Date: Tue, 7 Feb 2023 11:11:02 +0000
Subject: [PATCH 001/200] Convert D01 up until L470

---
 R/create_individual_file.R | 472 +++++++++++++++++++++++++++++++++++++
 1 file changed, 472 insertions(+)
 create mode 100644 R/create_individual_file.R

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
new file mode 100644
index 000000000..b1e39f8df
--- /dev/null
+++ b/R/create_individual_file.R
@@ -0,0 +1,472 @@
+#' Create individual file
+#'
+#' @description Creates the individual file from the episode file: drops blank
+#' CHIs, flags the first row per cij_marker and per ch_chi_cis within each CHI,
+#' then adds the CIJ columns and the prefixed per-source columns.
+#' @param episode_file Tibble containing episodic data
+create_individual_file <- function(episode_file) {
+  with_chi <- remove_blank_chi(episode_file)
+  cij_flagged <- find_non_duplicates(with_chi, .data$cij_marker, "Distinct_CIJ")
+  cij_added <- add_cij_columns(cij_flagged)
+  ch_flagged <- find_non_duplicates(cij_added, .data$ch_chi_cis, "first_ch_ep")
+  add_all_columns(ch_flagged)
+}
+
+#' Remove blank CHI
+#'
+#' @description Recode empty-string CHI values as NA, then drop every row
+#' whose CHI is NA.
+#'
+#' @inheritParams create_individual_file
+remove_blank_chi <- function(episode_file) {
+  blank_as_na <- dplyr::mutate(episode_file, chi = dplyr::na_if(.data$chi, ""))
+  dplyr::filter(blank_as_na, !is.na(.data$chi))
+}
+
+#' Find non-duplicates
+#'
+#' @description Add a 0/1 flag column: within each CHI, the first row for each
+#' value of group gets 1 and repeats get 0; rows where group is NA get 0.
+#'
+#' @inheritParams create_individual_file
+#' @param group Column to group by
+#' @param col_name Name of new column
+find_non_duplicates <- function(episode_file, group, col_name) {
+  by_chi <- dplyr::group_by(episode_file, .data$chi, {{ group }})
+  marked <- dplyr::mutate(by_chi, "{col_name}" := as.numeric(!duplicated({{ group }})))
+  marked <- dplyr::ungroup(marked)
+  # Rows with a missing grouping value never count as a distinct observation
+  dplyr::mutate(marked, "{col_name}" := replace(.data[[col_name]], is.na({{ group }}), 0))
+}
+
+#' Add CIJ-related columns
+#'
+#' @description Add CIJ_non_el / CIJ_el / CIJ_mat (Distinct_CIJ split by cij_pattype_code 0 / 1 / 2),
+#' recode cij_delay to 0/1, and add preventable_admissions and preventable_beddays.
+#' @inheritParams create_individual_file
+add_cij_columns <- function(episode_file) {
+  episode_file %>%
+    dplyr::mutate(
+      CIJ_non_el = dplyr::if_else(.data$cij_pattype_code == 0,
+        .data$Distinct_CIJ,
+        NA_real_
+      ),
+      CIJ_el = dplyr::if_else(.data$cij_pattype_code == 1,
+        .data$Distinct_CIJ,
+        NA_real_
+      ),
+      CIJ_mat = dplyr::if_else(.data$cij_pattype_code == 2,
+        .data$Distinct_CIJ,
+        NA_real_
+      )
+    ) %>%
+    dplyr::mutate(cij_delay = dplyr::if_else( # keep the delay flag only on the distinct-CIJ row
+      (.data$cij_delay == 1 & .data$Distinct_CIJ == 1),
+      1,
+      0
+    )) %>%
+    dplyr::mutate(
+      preventable_admissions = dplyr::if_else(
+        (.data$cij_ppa == 1 & .data$Distinct_CIJ == 1),
+        1,
+        0
+      ),
+      preventable_beddays = dplyr::if_else(
+        (.data$cij_ppa == 1 & .data$Distinct_CIJ == 1),
+        as.numeric(difftime(.data$cij_end_date, .data$cij_start_date, units = "days")), # explicit units: always days
+        0
+      )
+    )
+}
+
+#' Add all columns
+#'
+#' @description Add new columns based on SMRType and recid which follow a pattern
+#' of prefixed column names created based on some condition.
+#'
+#' @inheritParams create_individual_file
+add_all_columns <- function(episode_file) {
+  episode_file %>% # each helper receives a column prefix and an unevaluated row condition
+    add_acute_columns("Acute", (smrtype == "Acute-DC" | smrtype == "Acute-IP") & cij_pattype != "Maternity") %>%
+    add_mat_columns("Mat", recid == "02B" | cij_pattype == "Maternity") %>%
+    add_mh_columns("MH", recid == "04B" & cij_pattype != "Maternity") %>%
+    add_gls_columns("GLS", smrtype == "GLS-IP") %>%
+    add_op_columns("OP", recid == "00B") %>%
+    add_ae_columns("AE", recid == "AE2") %>%
+    add_pis_columns("PIS", recid == "PIS") %>%
+    add_ooh_columns("OoH", recid == "OoH") %>%
+    add_dn_columns("DN", recid == "DN") %>%
+    add_cmh_columns("CMH", recid == "CMH") %>%
+    add_dd_columns("DD", recid == "DD") %>%
+    add_nsu_columns("NSU", recid == "NSU") %>%
+    add_nrs_columns("NRS", recid == "NRS") %>%
+    add_hl1_columns("HL1", recid == "HL1") %>%
+    add_ch_columns("CH", recid == "CH") %>%
+    add_hc_columns("HC", recid == "HC") %>%
+    add_at_columns("AT", recid == "AT") %>%
+    add_sds_columns("SDS", recid == "SDS")
+}
+
+#' Add Acute columns
+#'
+#' @inheritParams create_individual_file
+#' @param prefix Prefix to add to related columns, e.g. "Acute"
+#' @param condition Unquoted logical expression on episode_file columns; new columns are filled where it is TRUE
+add_acute_columns <- function(episode_file, prefix, condition) {
+  condition <- substitute(condition) # capture the expression unevaluated; helpers eval() it inside mutate()
+  episode_file %>%
+    add_standard_cols(prefix, condition, episode = TRUE, cost = TRUE) %>%
+    add_ipdc_cols(prefix, condition)
+}
+
+#' Add Mat columns
+#' @description Adds the standard, episode and cost columns, then the IPDC columns with elective = FALSE.
+#' @inheritParams add_acute_columns
+add_mat_columns <- function(episode_file, prefix, condition) {
+  condition <- substitute(condition) # keep the caller's expression unevaluated
+  episode_file %>%
+    add_standard_cols(prefix, condition, episode = TRUE, cost = TRUE) %>%
+    add_ipdc_cols(prefix, condition, elective = FALSE)
+}
+
+#' Add MH columns
+#' @description Adds the standard, episode and cost columns, then the IPDC columns with ipdc_d = FALSE (no daycase columns).
+#' @inheritParams add_acute_columns
+add_mh_columns <- function(episode_file, prefix, condition) {
+  condition <- substitute(condition) # keep the caller's expression unevaluated
+  episode_file %>%
+    add_standard_cols(prefix, condition, episode = TRUE, cost = TRUE) %>%
+    add_ipdc_cols(prefix, condition, ipdc_d = FALSE)
+}
+
+#' Add GLS columns
+#' @description Adds the standard, episode and cost columns, then the IPDC columns with ipdc_d = FALSE (no daycase columns).
+#' @inheritParams add_acute_columns
+add_gls_columns <- function(episode_file, prefix, condition) {
+  condition <- substitute(condition) # keep the caller's expression unevaluated
+  episode_file %>%
+    add_standard_cols(prefix, condition, episode = TRUE, cost = TRUE) %>%
+    add_ipdc_cols(prefix, condition, ipdc_d = FALSE)
+}
+
+#' Add OP columns
+#' @description Adds the standard columns, then count/cost columns for attendance_status 1 and for attendance_status 5 or 8.
+#' @inheritParams add_acute_columns
+add_op_columns <- function(episode_file, prefix, condition) {
+  condition <- substitute(condition) # keep the caller's expression unevaluated
+  episode_file <- episode_file %>%
+    add_standard_cols(prefix, condition)
+  condition_1 <- substitute(condition & attendance_status == 1) # the captured call is spliced in for `condition`
+  episode_file <- episode_file %>%
+    dplyr::mutate(
+      "{prefix}_newcons_attendances" := dplyr::if_else(eval(condition_1), 1, NA_real_),
+      "{prefix}_cost_attend" := dplyr::if_else(eval(condition_1), cost_total_net, NA_real_)
+    )
+  condition_5_8 <- substitute(condition & attendance_status %in% c(5, 8))
+  episode_file <- episode_file %>%
+    dplyr::mutate(
+      "{prefix}_newcons_dnas" := dplyr::if_else(eval(condition_5_8), 1, NA_real_),
+      "{prefix}_cost_dnas" := dplyr::if_else(eval(condition_5_8), cost_total_net_incdnas, NA_real_) # note: different cost column
+    )
+  return(episode_file)
+}
+
+#' Add AE columns
+#' @description Adds the standard and cost columns plus prefix_attendances, set to 1 on rows matching condition.
+#' @inheritParams add_acute_columns
+add_ae_columns <- function(episode_file, prefix, condition) {
+  condition <- substitute(condition) # keep the caller's expression unevaluated
+  episode_file %>%
+    add_standard_cols(prefix, condition, cost = TRUE) %>% # cost = TRUE already creates prefix_cost
+    dplyr::mutate("{prefix}_attendances" := dplyr::if_else(eval(condition), 1, NA_real_)) # a count, not a cost
+}
+
+#' Add PIS columns
+#' @description Adds the standard and cost columns plus prefix_paid_items, copied from no_paid_items where condition holds.
+#' @inheritParams add_acute_columns
+add_pis_columns <- function(episode_file, prefix, condition) {
+  condition <- substitute(condition) # keep the caller's expression unevaluated
+  episode_file %>%
+    add_standard_cols(prefix, condition, cost = TRUE) %>%
+    dplyr::mutate("{prefix}_paid_items" := dplyr::if_else(eval(condition), no_paid_items, NA_real_))
+}
+
+#' Add OoH columns
+#' @description Adds the standard and cost columns, one flag column per OOH smrtype, and OoH_consultation_time in minutes (floored at 0).
+#' @inheritParams add_acute_columns
+add_ooh_columns <- function(episode_file, prefix, condition) {
+  condition <- substitute(condition) # keep the caller's expression unevaluated
+  episode_file <- episode_file %>%
+    add_standard_cols(prefix, condition, cost = TRUE) %>%
+    dplyr::mutate(
+      "{prefix}_homeV" := dplyr::if_else(eval(condition) & smrtype == "OOH-HomeV", 1, NA_real_),
+      "{prefix}_advice" := dplyr::if_else(eval(condition) & smrtype == "OOH-Advice", 1, NA_real_),
+      "{prefix}_DN" := dplyr::if_else(eval(condition) & smrtype == "OOH-DN", 1, NA_real_),
+      "{prefix}_NHS24" := dplyr::if_else(eval(condition) & smrtype == "OOH-NHS24", 1, NA_real_),
+      "{prefix}_other" := dplyr::if_else(eval(condition) & smrtype == "OOH-Other", 1, NA_real_),
+      "{prefix}_PCC" := dplyr::if_else(eval(condition) & smrtype == "OOH-PCC", 1, NA_real_),
+      ooh_covid_advice = dplyr::if_else(eval(condition) & smrtype == "OOH-C19Adv", 1, NA_real_),
+      ooh_covid_assessment = dplyr::if_else(eval(condition) & smrtype == "OOH-C19Ass", 1, NA_real_),
+      ooh_covid_other = dplyr::if_else(eval(condition) & smrtype == "OOH-C19Oth", 1, NA_real_) # letter O, matching C19Adv / C19Ass
+    )
+
+  episode_file <- episode_file %>%
+    dplyr::mutate(
+      OoH_consultation_time = dplyr::if_else(eval(condition), as.numeric((lubridate::seconds_to_period(keytime2) + keydate2_dateformat) - (lubridate::seconds_to_period(keytime1) + keydate1_dateformat), units = "mins"), NA_real_),
+      OoH_consultation_time = dplyr::if_else(OoH_consultation_time < 0, 0, OoH_consultation_time) # negative durations become 0
+    )
+  return(episode_file)
+}
+
+#' Add DN columns
+#' @description Adds the standard, episode and cost columns plus prefix_contacts, copied from totalnodncontacts where condition holds.
+#' @inheritParams add_acute_columns
+add_dn_columns <- function(episode_file, prefix, condition) {
+  condition <- substitute(condition) # keep the caller's expression unevaluated
+  episode_file %>%
+    add_standard_cols(prefix, condition, episode = TRUE, cost = TRUE) %>%
+    dplyr::mutate("{prefix}_contacts" := dplyr::if_else(eval(condition), totalnodncontacts, NA_real_))
+}
+
+#' Add CMH columns
+#' @description Adds the standard columns plus prefix_contacts, set to 1 on rows matching condition.
+#' @inheritParams add_acute_columns
+add_cmh_columns <- function(episode_file, prefix, condition) {
+  condition <- substitute(condition) # keep the caller's expression unevaluated
+  episode_file %>%
+    add_standard_cols(prefix, condition) %>%
+    dplyr::mutate("{prefix}_contacts" := dplyr::if_else(eval(condition), 1, NA_real_))
+}
+
+#' Add DD columns
+#' @description Adds episode and bedday columns split by whether primary_delay_reason is "9" (Code9) or not (NonCode9).
+#' @inheritParams add_acute_columns
+add_dd_columns <- function(episode_file, prefix, condition) {
+  condition <- substitute(condition) # keep the caller's expression unevaluated
+  condition_delay <- substitute(condition & primary_delay_reason != "9") # the captured call is spliced in for `condition`
+  episode_file <- episode_file %>% # note: unlike the sibling helpers, no add_standard_cols() call here
+    dplyr::mutate(
+      "{prefix}_NonCode9_episodes" := dplyr::if_else(eval(condition_delay), 1, NA_real_),
+      "{prefix}_NonCode9_beddays" := dplyr::if_else(eval(condition_delay), yearstay, NA_real_)
+    )
+  condition_delay_9 <- substitute(condition & primary_delay_reason == "9")
+  episode_file <- episode_file %>%
+    dplyr::mutate(
+      "{prefix}_Code9_episodes" := dplyr::if_else(eval(condition_delay_9), 1, NA_real_),
+      "{prefix}_Code9_beddays" := dplyr::if_else(eval(condition_delay_9), yearstay, NA_real_)
+    )
+  return(episode_file)
+}
+
+#' Add NSU columns
+#' @description Adds the standard columns plus a flag column named after prefix, set to 1 on rows matching condition.
+#' @inheritParams add_acute_columns
+add_nsu_columns <- function(episode_file, prefix, condition) {
+  condition <- substitute(condition) # keep the caller's expression unevaluated
+  episode_file %>%
+    add_standard_cols(prefix, condition) %>%
+    dplyr::mutate("{prefix}" := dplyr::if_else(eval(condition), 1, NA_real_)) # `:=` is required to name the column from the glue string
+}
+
+#' Add NRS columns
+#' @description Adds the standard columns plus a flag column named after prefix, set to 1 on rows matching condition.
+#' @inheritParams add_acute_columns
+add_nrs_columns <- function(episode_file, prefix, condition) {
+  condition <- substitute(condition) # keep the caller's expression unevaluated
+  episode_file %>%
+    add_standard_cols(prefix, condition) %>%
+    dplyr::mutate("{prefix}" := dplyr::if_else(eval(condition), 1, NA_real_)) # `:=` is required to name the column from the glue string
+}
+
+#' Add HL1 columns
+#' @description Adds the standard columns except gpprac, which is excluded via drop = "gpprac".
+#' @inheritParams add_acute_columns
+add_hl1_columns <- function(episode_file, prefix, condition) {
+  condition <- substitute(condition) # keep the caller's expression unevaluated
+  episode_file %>%
+    add_standard_cols(prefix, condition, drop = "gpprac")
+}
+
+#' Add CH columns
+#' @description Adds the standard columns plus ch_cis_episodes, ch_cost_per_day, ch_no_cost and ch_ep_end for rows matching condition.
+#' @inheritParams add_acute_columns
+add_ch_columns <- function(episode_file, prefix, condition) {
+  condition <- substitute(condition) # keep the caller's expression unevaluated
+  episode_file %>%
+    add_standard_cols(prefix, condition) %>%
+    dplyr::mutate(
+      ch_cis_episodes = dplyr::if_else(eval(condition), first_ch_ep, NA_real_),
+      ch_cost_per_day = dplyr::if_else(eval(condition) & yearstay > 0, cost_total_net / yearstay, NA_real_),
+      ch_cost_per_day = dplyr::if_else(eval(condition) & yearstay == 0, cost_total_net, ch_cost_per_day), # zero-day stay: use the cost as-is, never divide by 0
+      ch_no_cost = eval(condition) & is.na(ch_cost_per_day),
+      ch_ep_end = dplyr::if_else(eval(condition), keydate2_dateformat, lubridate::NA_Date_),
+      ch_ep_end = dplyr::if_else(eval(condition) & is.na(ch_ep_end), lubridate::quarter(zoo::as.yearqtr(sc_latest_submission), type = "date_first"), ch_ep_end) # fall back to the submission quarter's first date
+    )
+}
+
+#' Add HC columns
+#' @description Adds the standard and episode columns, total hours/cost, and episode/hours/cost columns for HC-Per, HC-Non-Per and reablement rows.
+#' @inheritParams add_acute_columns
+add_hc_columns <- function(episode_file, prefix, condition) {
+  condition <- substitute(condition) # keep the caller's expression unevaluated
+  episode_file <- episode_file %>%
+    add_standard_cols(prefix, condition, episode = TRUE) %>%
+    dplyr::mutate(
+      "{prefix}_total_hours" := dplyr::if_else(eval(condition), hc_hours_annual, NA_real_),
+      "{prefix}_total_cost" := dplyr::if_else(eval(condition), cost_total_net, NA_real_)
+    )
+  condition_per <- substitute(condition & smrtype == "HC-Per") # the captured call is spliced in for `condition`
+  episode_file <- episode_file %>%
+    dplyr::mutate(
+      "{prefix}_personal_episodes" := dplyr::if_else(eval(condition_per), 1, NA_real_),
+      "{prefix}_personal_hours" := dplyr::if_else(eval(condition_per), hc_hours_annual, NA_real_), # source column, so any prefix works
+      "{prefix}_personal_hours_cost" := dplyr::if_else(eval(condition_per), cost_total_net, NA_real_)
+    )
+  condition_non_per <- substitute(condition & smrtype == "HC-Non-Per")
+  episode_file <- episode_file %>%
+    dplyr::mutate(
+      "{prefix}_non_personal_episodes" := dplyr::if_else(eval(condition_non_per), 1, NA_real_),
+      "{prefix}_non_personal_hours" := dplyr::if_else(eval(condition_non_per), hc_hours_annual, NA_real_),
+      "{prefix}_non_personal_hours_cost" := dplyr::if_else(eval(condition_non_per), cost_total_net, NA_real_)
+    )
+  condition_reabl <- substitute(condition & hc_reablement == 1)
+  episode_file %>% # last expression is the pipeline itself, so the result is returned visibly
+    dplyr::mutate(
+      "{prefix}_reablement_episodes" := dplyr::if_else(eval(condition_reabl), 1, NA_real_),
+      "{prefix}_reablement_hours" := dplyr::if_else(eval(condition_reabl), hc_hours_annual, NA_real_),
+      "{prefix}_reablement_hours_cost" := dplyr::if_else(eval(condition_reabl), cost_total_net, NA_real_)
+    )
+}
+
+#' Add AT columns
+#' @description Adds the standard columns plus prefix_alarms and prefix_telecare flags for smrtype AT-Alarm and AT-Tele.
+#' @inheritParams add_acute_columns
+add_at_columns <- function(episode_file, prefix, condition) {
+  condition <- substitute(condition) # keep the caller's expression unevaluated
+  episode_file %>%
+    add_standard_cols(prefix, condition) %>%
+    dplyr::mutate(
+      "{prefix}_alarms" := dplyr::if_else(eval(condition) & smrtype == "AT-Alarm", 1, NA_real_),
+      "{prefix}_telecare" := dplyr::if_else(eval(condition) & smrtype == "AT-Tele", 1, NA_real_)
+    )
+}
+
+#' Add SDS columns
+#' @description Adds the standard columns plus prefix_option_1 to prefix_option_4 flags for smrtype SDS-1 to SDS-4.
+#' @inheritParams add_acute_columns
+add_sds_columns <- function(episode_file, prefix, condition) {
+  condition <- substitute(condition) # keep the caller's expression unevaluated
+  episode_file %>%
+    add_standard_cols(prefix, condition) %>%
+    dplyr::mutate(
+      "{prefix}_option_1" := dplyr::if_else(eval(condition) & smrtype == "SDS-1", 1, NA_real_),
+      "{prefix}_option_2" := dplyr::if_else(eval(condition) & smrtype == "SDS-2", 1, NA_real_),
+      "{prefix}_option_3" := dplyr::if_else(eval(condition) & smrtype == "SDS-3", 1, NA_real_),
+      "{prefix}_option_4" := dplyr::if_else(eval(condition) & smrtype == "SDS-4", 1, NA_real_)
+    )
+}
+
+#' Add columns based on IPDC
+#'
+#' @description Add columns based on value in IPDC column, which can
+#' be further split by Elective/Non-Elective CIJ. condition must be an already-quoted expression.
+#'
+#' @inheritParams add_acute_columns
+#' @param ipdc_d Whether to create columns based on IPDC = "D" (lgl)
+#' @param elective Whether to create columns based on Elective/Non-Elective cij_pattype (lgl)
+add_ipdc_cols <- function(episode_file, prefix, condition, ipdc_d = TRUE, elective = TRUE) {
+  condition_i <- bquote(.(condition) & ipdc == "I") # splice the quoted call itself, independent of the caller's variable name
+  episode_file <- episode_file %>%
+    dplyr::mutate(
+      "{prefix}_inpatient_episodes" := dplyr::if_else(eval(condition_i), 1, NA_real_),
+      "{prefix}_inpatient_beddays" := dplyr::if_else(eval(condition_i), yearstay, NA_real_)
+    )
+  if (elective) {
+    condition_el <- bquote(.(condition_i) & cij_pattype == "Elective")
+    episode_file <- episode_file %>%
+      dplyr::mutate(
+        "{prefix}_el_inpatient_episodes" := dplyr::if_else(eval(condition_el), 1, NA_real_),
+        "{prefix}_el_inpatient_beddays" := dplyr::if_else(eval(condition_el), yearstay, NA_real_),
+        "{prefix}_el_inpatient_cost" := dplyr::if_else(eval(condition_el), cost_total_net, NA_real_)
+      )
+    condition_non_el <- bquote(.(condition_i) & cij_pattype == "Non-Elective")
+    episode_file <- episode_file %>%
+      dplyr::mutate(
+        "{prefix}_non_el_inpatient_episodes" := dplyr::if_else(eval(condition_non_el), 1, NA_real_),
+        "{prefix}_non_el_inpatient_beddays" := dplyr::if_else(eval(condition_non_el), yearstay, NA_real_),
+        "{prefix}_non_el_inpatient_cost" := dplyr::if_else(eval(condition_non_el), cost_total_net, NA_real_)
+      )
+  }
+  if (ipdc_d) {
+    condition_d <- bquote(.(condition) & ipdc == "D")
+    episode_file <- episode_file %>%
+      dplyr::mutate(
+        "{prefix}_daycase_episodes" := dplyr::if_else(eval(condition_d), 1, NA_real_),
+        "{prefix}_daycase_cost" := dplyr::if_else(eval(condition_d), cost_total_net, NA_real_)
+      )
+  }
+  return(episode_file)
+}
+
+#' Add standard columns
+#'
+#' @description Add standard columns (DoB, postcode, gpprac, episodes, cost) to episode file.
+#' Here condition must already be a quoted expression, because it is passed to eval().
+#' @inheritParams add_acute_columns
+#' @param drop Any columns out of "DoB", "postcode", and "gpprac" that should be dropped
+#' @param episode Whether to create prefix_episodes col, e.g. "Acute_episodes"
+#' @param cost Whether to create prefix_cost col, e.g. "Acute_cost"
+add_standard_cols <- function(episode_file, prefix, condition, drop = NULL, episode = FALSE, cost = FALSE) {
+  episode_file <- dplyr::bind_cols(episode_file, create_cols(episode_file, prefix, condition, drop)) # append the prefixed DoB/postcode/gpprac columns
+  if (episode) {
+    episode_file <- dplyr::mutate(episode_file, "{prefix}_episodes" := dplyr::if_else(eval(condition), 1, NA_real_))
+  }
+  if (cost) {
+    episode_file <- dplyr::mutate(episode_file, "{prefix}_cost" := dplyr::if_else(eval(condition), cost_total_net, NA_real_))
+  }
+  return(episode_file)
+}
+
+#' Create standard cols
+#'
+#' @description Create standard cols (DoB, postcode, gpprac), returned as a tibble of only the new columns.
+#'
+#' @inheritParams add_acute_columns
+#' @param drop Any columns out of "DoB", "postcode", and "gpprac" that should be dropped
+create_cols <- function(episode_file, prefix, condition, drop) {
+  cols <- c("DoB", "postcode", "gpprac")
+  if (!is.null(drop)) {
+    # setdiff() copes with several names in `drop`; `!=` would recycle them
+    cols <- setdiff(cols, drop)
+  }
+  purrr::map_dfc(cols, ~ create_col(episode_file, .x, prefix, condition))
+}
+
+#' Create standard col
+#'
+#' @description Create single standard column named prefix_col: the lower-cased source column where
+#' condition holds and a type-matched NA elsewhere.
+#' @inheritParams add_acute_columns
+#' @inheritParams na_type
+create_col <- function(episode_file, col, prefix, condition) {
+  episode_file %>%
+    dplyr::mutate("{prefix}_{col}" := dplyr::if_else(eval(condition), .data[[tolower(col)]], na_type(col))) %>%
+    dplyr::select(dplyr::last_col()) # keeps only the last column; assumes prefix_col is new and so was appended last
+}
+
+#' NA type
+#'
+#' @description Helper function to use correct NA type depending on
+#' which type of column is created.
+#'
+#' @param col Which column to create ("DoB", "postcode", or "gpprac")
+na_type <- function(col = c("DoB", "postcode", "gpprac")) {
+  col <- match.arg(col)
+  # match.arg() validates col and collapses the default vector to a single string for switch()
+  switch(col,
+    "DoB" = lubridate::NA_Date_,
+    "postcode" = NA_character_,
+    "gpprac" = NA_real_
+  )
+}
+

From e095ddf5d3fe5c0bccf1e3a991be66bd7fa5c408 Mon Sep 17 00:00:00 2001
From: jr-mandy <jr-mandy@users.noreply.github.com>
Date: Wed, 8 Feb 2023 11:33:24 +0000
Subject: [PATCH 002/200] Update documentation

---
 man/add_acute_columns.Rd      | 16 ++++++++++++++++
 man/add_ae_columns.Rd         | 16 ++++++++++++++++
 man/add_all_columns.Rd        | 12 ++++++++++++
 man/add_at_columns.Rd         | 16 ++++++++++++++++
 man/add_ch_columns.Rd         | 16 ++++++++++++++++
 man/add_cij_columns.Rd        | 14 ++++++++++++++
 man/add_cmh_columns.Rd        | 16 ++++++++++++++++
 man/add_dd_columns.Rd         | 16 ++++++++++++++++
 man/add_dn_columns.Rd         | 16 ++++++++++++++++
 man/add_gls_columns.Rd        | 16 ++++++++++++++++
 man/add_hc_columns.Rd         | 16 ++++++++++++++++
 man/add_hl1_columns.Rd        | 16 ++++++++++++++++
 man/add_ipdc_cols.Rd          | 21 +++++++++++++++++++++
 man/add_mat_columns.Rd        | 16 ++++++++++++++++
 man/add_mh_columns.Rd         | 16 ++++++++++++++++
 man/add_nrs_columns.Rd        | 16 ++++++++++++++++
 man/add_nsu_columns.Rd        | 16 ++++++++++++++++
 man/add_ooh_columns.Rd        | 16 ++++++++++++++++
 man/add_op_columns.Rd         | 16 ++++++++++++++++
 man/add_pis_columns.Rd        | 16 ++++++++++++++++
 man/add_sds_columns.Rd        | 16 ++++++++++++++++
 man/add_standard_cols.Rd      | 29 +++++++++++++++++++++++++++++
 man/create_col.Rd             | 18 ++++++++++++++++++
 man/create_cols.Rd            | 18 ++++++++++++++++++
 man/create_individual_file.Rd | 14 ++++++++++++++
 man/find_non_duplicates.Rd    | 19 +++++++++++++++++++
 man/na_type.Rd                | 15 +++++++++++++++
 man/remove_blank_chi.Rd       | 14 ++++++++++++++
 28 files changed, 462 insertions(+)
 create mode 100644 man/add_acute_columns.Rd
 create mode 100644 man/add_ae_columns.Rd
 create mode 100644 man/add_all_columns.Rd
 create mode 100644 man/add_at_columns.Rd
 create mode 100644 man/add_ch_columns.Rd
 create mode 100644 man/add_cij_columns.Rd
 create mode 100644 man/add_cmh_columns.Rd
 create mode 100644 man/add_dd_columns.Rd
 create mode 100644 man/add_dn_columns.Rd
 create mode 100644 man/add_gls_columns.Rd
 create mode 100644 man/add_hc_columns.Rd
 create mode 100644 man/add_hl1_columns.Rd
 create mode 100644 man/add_ipdc_cols.Rd
 create mode 100644 man/add_mat_columns.Rd
 create mode 100644 man/add_mh_columns.Rd
 create mode 100644 man/add_nrs_columns.Rd
 create mode 100644 man/add_nsu_columns.Rd
 create mode 100644 man/add_ooh_columns.Rd
 create mode 100644 man/add_op_columns.Rd
 create mode 100644 man/add_pis_columns.Rd
 create mode 100644 man/add_sds_columns.Rd
 create mode 100644 man/add_standard_cols.Rd
 create mode 100644 man/create_col.Rd
 create mode 100644 man/create_cols.Rd
 create mode 100644 man/create_individual_file.Rd
 create mode 100644 man/find_non_duplicates.Rd
 create mode 100644 man/na_type.Rd
 create mode 100644 man/remove_blank_chi.Rd

diff --git a/man/add_acute_columns.Rd b/man/add_acute_columns.Rd
new file mode 100644
index 000000000..db02a88a5
--- /dev/null
+++ b/man/add_acute_columns.Rd
@@ -0,0 +1,16 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{add_acute_columns}
+\alias{add_acute_columns}
+\title{Add Acute columns}
+\usage{
+add_acute_columns(episode_file, prefix, condition)
+}
+\arguments{
+\item{prefix}{Prefix to add to related columns, e.g. "Acute"}
+
+\item{condition}{Condition to create new columns based on}
+}
+\description{
+Add Acute columns
+}
diff --git a/man/add_ae_columns.Rd b/man/add_ae_columns.Rd
new file mode 100644
index 000000000..3c90fb4f1
--- /dev/null
+++ b/man/add_ae_columns.Rd
@@ -0,0 +1,16 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{add_ae_columns}
+\alias{add_ae_columns}
+\title{Add AE columns}
+\usage{
+add_ae_columns(episode_file, prefix, condition)
+}
+\arguments{
+\item{prefix}{Prefix to add to related columns, e.g. "Acute"}
+
+\item{condition}{Condition to create new columns based on}
+}
+\description{
+Add AE columns
+}
diff --git a/man/add_all_columns.Rd b/man/add_all_columns.Rd
new file mode 100644
index 000000000..ce0540864
--- /dev/null
+++ b/man/add_all_columns.Rd
@@ -0,0 +1,12 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{add_all_columns}
+\alias{add_all_columns}
+\title{Add all columns}
+\usage{
+add_all_columns(episode_file)
+}
+\description{
+Add new columns based on SMRType and recid which follow a pattern
+of prefixed column names created based on some condition.
+}
diff --git a/man/add_at_columns.Rd b/man/add_at_columns.Rd
new file mode 100644
index 000000000..5cb469eea
--- /dev/null
+++ b/man/add_at_columns.Rd
@@ -0,0 +1,16 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{add_at_columns}
+\alias{add_at_columns}
+\title{Add AT columns}
+\usage{
+add_at_columns(episode_file, prefix, condition)
+}
+\arguments{
+\item{prefix}{Prefix to add to related columns, e.g. "Acute"}
+
+\item{condition}{Condition to create new columns based on}
+}
+\description{
+Add AT columns
+}
diff --git a/man/add_ch_columns.Rd b/man/add_ch_columns.Rd
new file mode 100644
index 000000000..e0abfeaa0
--- /dev/null
+++ b/man/add_ch_columns.Rd
@@ -0,0 +1,16 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{add_ch_columns}
+\alias{add_ch_columns}
+\title{Add CH columns}
+\usage{
+add_ch_columns(episode_file, prefix, condition)
+}
+\arguments{
+\item{prefix}{Prefix to add to related columns, e.g. "Acute"}
+
+\item{condition}{Condition to create new columns based on}
+}
+\description{
+Add CH columns
+}
diff --git a/man/add_cij_columns.Rd b/man/add_cij_columns.Rd
new file mode 100644
index 000000000..7d00e6299
--- /dev/null
+++ b/man/add_cij_columns.Rd
@@ -0,0 +1,14 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{add_cij_columns}
+\alias{add_cij_columns}
+\title{Add CIJ-related columns}
+\usage{
+add_cij_columns(episode_file)
+}
+\arguments{
+\item{episode_file}{Tibble containing episodic data}
+}
+\description{
+Add new columns related to CIJ
+}
diff --git a/man/add_cmh_columns.Rd b/man/add_cmh_columns.Rd
new file mode 100644
index 000000000..ebb80c293
--- /dev/null
+++ b/man/add_cmh_columns.Rd
@@ -0,0 +1,16 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{add_cmh_columns}
+\alias{add_cmh_columns}
+\title{Add CMH columns}
+\usage{
+add_cmh_columns(episode_file, prefix, condition)
+}
+\arguments{
+\item{prefix}{Prefix to add to related columns, e.g. "Acute"}
+
+\item{condition}{Condition to create new columns based on}
+}
+\description{
+Add CMH columns
+}
diff --git a/man/add_dd_columns.Rd b/man/add_dd_columns.Rd
new file mode 100644
index 000000000..425169e70
--- /dev/null
+++ b/man/add_dd_columns.Rd
@@ -0,0 +1,16 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{add_dd_columns}
+\alias{add_dd_columns}
+\title{Add DD columns}
+\usage{
+add_dd_columns(episode_file, prefix, condition)
+}
+\arguments{
+\item{prefix}{Prefix to add to related columns, e.g. "Acute"}
+
+\item{condition}{Condition to create new columns based on}
+}
+\description{
+Add DD columns
+}
diff --git a/man/add_dn_columns.Rd b/man/add_dn_columns.Rd
new file mode 100644
index 000000000..0f97bd01f
--- /dev/null
+++ b/man/add_dn_columns.Rd
@@ -0,0 +1,16 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{add_dn_columns}
+\alias{add_dn_columns}
+\title{Add DN columns}
+\usage{
+add_dn_columns(episode_file, prefix, condition)
+}
+\arguments{
+\item{prefix}{Prefix to add to related columns, e.g. "Acute"}
+
+\item{condition}{Condition to create new columns based on}
+}
+\description{
+Add DN columns
+}
diff --git a/man/add_gls_columns.Rd b/man/add_gls_columns.Rd
new file mode 100644
index 000000000..4475fa5d0
--- /dev/null
+++ b/man/add_gls_columns.Rd
@@ -0,0 +1,16 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{add_gls_columns}
+\alias{add_gls_columns}
+\title{Add GLS columns}
+\usage{
+add_gls_columns(episode_file, prefix, condition)
+}
+\arguments{
+\item{prefix}{Prefix to add to related columns, e.g. "Acute"}
+
+\item{condition}{Condition to create new columns based on}
+}
+\description{
+Add GLS columns
+}
diff --git a/man/add_hc_columns.Rd b/man/add_hc_columns.Rd
new file mode 100644
index 000000000..60352e37b
--- /dev/null
+++ b/man/add_hc_columns.Rd
@@ -0,0 +1,16 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{add_hc_columns}
+\alias{add_hc_columns}
+\title{Add HC columns}
+\usage{
+add_hc_columns(episode_file, prefix, condition)
+}
+\arguments{
+\item{prefix}{Prefix to add to related columns, e.g. "Acute"}
+
+\item{condition}{Condition to create new columns based on}
+}
+\description{
+Add HC columns
+}
diff --git a/man/add_hl1_columns.Rd b/man/add_hl1_columns.Rd
new file mode 100644
index 000000000..03dcc2dac
--- /dev/null
+++ b/man/add_hl1_columns.Rd
@@ -0,0 +1,16 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{add_hl1_columns}
+\alias{add_hl1_columns}
+\title{Add HL1 columns}
+\usage{
+add_hl1_columns(episode_file, prefix, condition)
+}
+\arguments{
+\item{prefix}{Prefix to add to related columns, e.g. "Acute"}
+
+\item{condition}{Condition to create new columns based on}
+}
+\description{
+Add HL1 columns
+}
diff --git a/man/add_ipdc_cols.Rd b/man/add_ipdc_cols.Rd
new file mode 100644
index 000000000..537f6d0ce
--- /dev/null
+++ b/man/add_ipdc_cols.Rd
@@ -0,0 +1,21 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{add_ipdc_cols}
+\alias{add_ipdc_cols}
+\title{Add columns based on IPDC}
+\usage{
+add_ipdc_cols(episode_file, prefix, condition, ipdc_d = TRUE, elective = TRUE)
+}
+\arguments{
+\item{prefix}{Prefix to add to related columns, e.g. "Acute"}
+
+\item{condition}{Condition to create new columns based on}
+
+\item{ipdc_d}{Whether to create columns based on IPDC = "D" (lgl)}
+
+\item{elective}{Whether to create columns based on Elective/Non-Elective cij_pattype (lgl)}
+}
+\description{
+Add columns based on value in IPDC column, which can
+be further split by Elective/Non-Elective CIJ.
+}
diff --git a/man/add_mat_columns.Rd b/man/add_mat_columns.Rd
new file mode 100644
index 000000000..2836faa2a
--- /dev/null
+++ b/man/add_mat_columns.Rd
@@ -0,0 +1,16 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{add_mat_columns}
+\alias{add_mat_columns}
+\title{Add Mat columns}
+\usage{
+add_mat_columns(episode_file, prefix, condition)
+}
+\arguments{
+\item{prefix}{Prefix to add to related columns, e.g. "Acute"}
+
+\item{condition}{Condition to create new columns based on}
+}
+\description{
+Add Mat columns
+}
diff --git a/man/add_mh_columns.Rd b/man/add_mh_columns.Rd
new file mode 100644
index 000000000..5c1279656
--- /dev/null
+++ b/man/add_mh_columns.Rd
@@ -0,0 +1,16 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{add_mh_columns}
+\alias{add_mh_columns}
+\title{Add MH columns}
+\usage{
+add_mh_columns(episode_file, prefix, condition)
+}
+\arguments{
+\item{prefix}{Prefix to add to related columns, e.g. "Acute"}
+
+\item{condition}{Condition to create new columns based on}
+}
+\description{
+Add MH columns
+}
diff --git a/man/add_nrs_columns.Rd b/man/add_nrs_columns.Rd
new file mode 100644
index 000000000..20076ce93
--- /dev/null
+++ b/man/add_nrs_columns.Rd
@@ -0,0 +1,16 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{add_nrs_columns}
+\alias{add_nrs_columns}
+\title{Add NRS columns}
+\usage{
+add_nrs_columns(episode_file, prefix, condition)
+}
+\arguments{
+\item{prefix}{Prefix to add to related columns, e.g. "Acute"}
+
+\item{condition}{Condition to create new columns based on}
+}
+\description{
+Add NRS columns
+}
diff --git a/man/add_nsu_columns.Rd b/man/add_nsu_columns.Rd
new file mode 100644
index 000000000..8518dc6dd
--- /dev/null
+++ b/man/add_nsu_columns.Rd
@@ -0,0 +1,16 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{add_nsu_columns}
+\alias{add_nsu_columns}
+\title{Add NSU columns}
+\usage{
+add_nsu_columns(episode_file, prefix, condition)
+}
+\arguments{
+\item{prefix}{Prefix to add to related columns, e.g. "Acute"}
+
+\item{condition}{Condition to create new columns based on}
+}
+\description{
+Add NSU columns
+}
diff --git a/man/add_ooh_columns.Rd b/man/add_ooh_columns.Rd
new file mode 100644
index 000000000..5a9078259
--- /dev/null
+++ b/man/add_ooh_columns.Rd
@@ -0,0 +1,16 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{add_ooh_columns}
+\alias{add_ooh_columns}
+\title{Add OoH columns}
+\usage{
+add_ooh_columns(episode_file, prefix, condition)
+}
+\arguments{
+\item{prefix}{Prefix to add to related columns, e.g. "Acute"}
+
+\item{condition}{Condition to create new columns based on}
+}
+\description{
+Add OoH columns
+}
diff --git a/man/add_op_columns.Rd b/man/add_op_columns.Rd
new file mode 100644
index 000000000..5fd8d78c7
--- /dev/null
+++ b/man/add_op_columns.Rd
@@ -0,0 +1,16 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{add_op_columns}
+\alias{add_op_columns}
+\title{Add OP columns}
+\usage{
+add_op_columns(episode_file, prefix, condition)
+}
+\arguments{
+\item{prefix}{Prefix to add to related columns, e.g. "Acute"}
+
+\item{condition}{Condition to create new columns based on}
+}
+\description{
+Add OP columns
+}
diff --git a/man/add_pis_columns.Rd b/man/add_pis_columns.Rd
new file mode 100644
index 000000000..b19178df7
--- /dev/null
+++ b/man/add_pis_columns.Rd
@@ -0,0 +1,16 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{add_pis_columns}
+\alias{add_pis_columns}
+\title{Add PIS columns}
+\usage{
+add_pis_columns(episode_file, prefix, condition)
+}
+\arguments{
+\item{prefix}{Prefix to add to related columns, e.g. "Acute"}
+
+\item{condition}{Condition to create new columns based on}
+}
+\description{
+Add PIS columns
+}
diff --git a/man/add_sds_columns.Rd b/man/add_sds_columns.Rd
new file mode 100644
index 000000000..ec2a4668e
--- /dev/null
+++ b/man/add_sds_columns.Rd
@@ -0,0 +1,16 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{add_sds_columns}
+\alias{add_sds_columns}
+\title{Add SDS columns}
+\usage{
+add_sds_columns(episode_file, prefix, condition)
+}
+\arguments{
+\item{prefix}{Prefix to add to related columns, e.g. "Acute"}
+
+\item{condition}{Condition to create new columns based on}
+}
+\description{
+Add SDS columns
+}
diff --git a/man/add_standard_cols.Rd b/man/add_standard_cols.Rd
new file mode 100644
index 000000000..35ee445f2
--- /dev/null
+++ b/man/add_standard_cols.Rd
@@ -0,0 +1,29 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{add_standard_cols}
+\alias{add_standard_cols}
+\title{Add columns based on IPDC}
+\usage{
+add_standard_cols(
+  episode_file,
+  prefix,
+  condition,
+  drop = NULL,
+  episode = FALSE,
+  cost = FALSE
+)
+}
+\arguments{
+\item{prefix}{Prefix to add to related columns, e.g. "Acute"}
+
+\item{condition}{Condition to create new columns based on}
+
+\item{drop}{Any columns out of "DoB", "postcode", and "gpprac" that should be dropped}
+
+\item{episode}{Whether to create prefix_episodes col, e.g. "Acute_episodes"}
+
+\item{cost}{Whether to create prefix_cost col, e.g. "Acute_cost"}
+}
+\description{
+Add standard columns (DoB, postcode, gpprac, episodes, cost) to episode file.
+}
diff --git a/man/create_col.Rd b/man/create_col.Rd
new file mode 100644
index 000000000..496057bc1
--- /dev/null
+++ b/man/create_col.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{create_col}
+\alias{create_col}
+\title{Create standard col}
+\usage{
+create_col(episode_file, col, prefix, condition)
+}
+\arguments{
+\item{col}{Which column to create ("DoB", "postcode", or "gpprac")}
+
+\item{prefix}{Prefix to add to related columns, e.g. "Acute"}
+
+\item{condition}{Condition to create new columns based on}
+}
+\description{
+Create single standard column.
+}
diff --git a/man/create_cols.Rd b/man/create_cols.Rd
new file mode 100644
index 000000000..d6540fa89
--- /dev/null
+++ b/man/create_cols.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{create_cols}
+\alias{create_cols}
+\title{Create standard cols}
+\usage{
+create_cols(episode_file, prefix, condition, drop)
+}
+\arguments{
+\item{prefix}{Prefix to add to related columns, e.g. "Acute"}
+
+\item{condition}{Condition to create new columns based on}
+
+\item{drop}{Any columns out of "DoB", "postcode", and "gpprac" that should be dropped}
+}
+\description{
+Create standard cols (DoB, postcode, gpprac).
+}
diff --git a/man/create_individual_file.Rd b/man/create_individual_file.Rd
new file mode 100644
index 000000000..8b0887565
--- /dev/null
+++ b/man/create_individual_file.Rd
@@ -0,0 +1,14 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{create_individual_file}
+\alias{create_individual_file}
+\title{Create individual file}
+\usage{
+create_individual_file(episode_file)
+}
+\arguments{
+\item{episode_file}{Tibble containing episodic data}
+}
+\description{
+Creates individual file from episode file
+}
diff --git a/man/find_non_duplicates.Rd b/man/find_non_duplicates.Rd
new file mode 100644
index 000000000..ba82bd5c4
--- /dev/null
+++ b/man/find_non_duplicates.Rd
@@ -0,0 +1,19 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{find_non_duplicates}
+\alias{find_non_duplicates}
+\title{Find non-duplicates}
+\usage{
+find_non_duplicates(episode_file, group, col_name)
+}
+\arguments{
+\item{episode_file}{Tibble containing episodic data}
+
+\item{group}{Column to group by}
+
+\item{col_name}{Name of new column}
+}
+\description{
+Create new column which marks first (per group)
+non-duplicated observation as 1, with any duplicates marked as 0.
+}
diff --git a/man/na_type.Rd b/man/na_type.Rd
new file mode 100644
index 000000000..f8cbc9581
--- /dev/null
+++ b/man/na_type.Rd
@@ -0,0 +1,15 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{na_type}
+\alias{na_type}
+\title{NA type}
+\usage{
+na_type(col = c("DoB", "postcode", "gpprac"))
+}
+\arguments{
+\item{col}{Which column to create ("DoB", "postcode", or "gpprac")}
+}
+\description{
+Helper function to use correct NA type depending on
+which type of column is created.
+}
diff --git a/man/remove_blank_chi.Rd b/man/remove_blank_chi.Rd
new file mode 100644
index 000000000..9cba40a8f
--- /dev/null
+++ b/man/remove_blank_chi.Rd
@@ -0,0 +1,14 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{remove_blank_chi}
+\alias{remove_blank_chi}
+\title{Remove blank CHI}
+\usage{
+remove_blank_chi(episode_file)
+}
+\arguments{
+\item{episode_file}{Tibble containing episodic data}
+}
+\description{
+Convert blank strings to NA and remove NAs from CHI column
+}

From 78cc51fb9509cc0830b2f43cb8a4ad3c11b82375 Mon Sep 17 00:00:00 2001
From: jr-mandy <jr-mandy@users.noreply.github.com>
Date: Wed, 8 Feb 2023 11:38:11 +0000
Subject: [PATCH 003/200] Style code

---
 R/create_individual_file.R | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index b1e39f8df..f6c16722a 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -48,16 +48,16 @@ add_cij_columns <- function(episode_file) {
   episode_file %>%
     dplyr::mutate(
       CIJ_non_el = dplyr::if_else(.data$cij_pattype_code == 0,
-                                  .data$Distinct_CIJ,
-                                  NA_real_
+        .data$Distinct_CIJ,
+        NA_real_
       ),
       CIJ_el = dplyr::if_else(.data$cij_pattype_code == 1,
-                              .data$Distinct_CIJ,
-                              NA_real_
+        .data$Distinct_CIJ,
+        NA_real_
       ),
       CIJ_mat = dplyr::if_else(.data$cij_pattype_code == 2,
-                               .data$Distinct_CIJ,
-                               NA_real_
+        .data$Distinct_CIJ,
+        NA_real_
       )
     ) %>%
     dplyr::mutate(cij_delay = dplyr::if_else(
@@ -469,4 +469,3 @@ na_type <- function(col = c("DoB", "postcode", "gpprac")) {
   )
   return(na_type)
 }
-

From e1b69a0d6a191da6f80bc68452945a91a1fe8485 Mon Sep 17 00:00:00 2001
From: Mandy Norrbo <mandy@jumpingrivers.com>
Date: Wed, 8 Feb 2023 11:42:57 +0000
Subject: [PATCH 004/200] Fix typo

---
 R/create_individual_file.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index b1e39f8df..d650c611a 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -408,7 +408,7 @@ add_ipdc_cols <- function(episode_file, prefix, condition, ipdc_d = TRUE, electi
   return(episode_file)
 }
 
-#' Add columns based on IPDC
+#' Add standard columns
 #'
 #' @description Add standard columns (DoB, postcode, gpprac, episodes, cost) to episode file.
 #'

From 18f78f9e51e2bb783cad02f527e9c3d538aa19e4 Mon Sep 17 00:00:00 2001
From: jr-mandy <jr-mandy@users.noreply.github.com>
Date: Wed, 8 Feb 2023 11:47:19 +0000
Subject: [PATCH 005/200] Update documentation

---
 man/add_standard_cols.Rd | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/man/add_standard_cols.Rd b/man/add_standard_cols.Rd
index 35ee445f2..2e22df08a 100644
--- a/man/add_standard_cols.Rd
+++ b/man/add_standard_cols.Rd
@@ -2,7 +2,7 @@
 % Please edit documentation in R/create_individual_file.R
 \name{add_standard_cols}
 \alias{add_standard_cols}
-\title{Add columns based on IPDC}
+\title{Add standard columns}
 \usage{
 add_standard_cols(
   episode_file,

From de62045de2a311d809235f01e37b24092c8062b6 Mon Sep 17 00:00:00 2001
From: Mandy Norrbo <mandy@jumpingrivers.com>
Date: Wed, 8 Feb 2023 12:56:07 +0000
Subject: [PATCH 006/200] Fix typos

---
 R/create_individual_file.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index 1b11cd0ae..e3caf9ae6 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -265,7 +265,7 @@ add_nsu_columns <- function(episode_file, prefix, condition) {
   condition <- substitute(condition)
   episode_file %>%
     add_standard_cols(prefix, condition) %>%
-    dplyr::mutate("{prefix}", dplyr::if_else(eval(condition), 1, NA_real_))
+    dplyr::mutate("{prefix}" := dplyr::if_else(eval(condition), 1, NA_real_))
 }
 
 #' Add NRS columns
@@ -275,7 +275,7 @@ add_nrs_columns <- function(episode_file, prefix, condition) {
   condition <- substitute(condition)
   episode_file %>%
     add_standard_cols(prefix, condition) %>%
-    dplyr::mutate("{prefix}", dplyr::if_else(eval(condition), 1, NA_real_))
+    dplyr::mutate("{prefix}" := dplyr::if_else(eval(condition), 1, NA_real_))
 }
 
 #' Add HL1 columns

From 765072bd61a028e45d43373c01a46665f7b90a75 Mon Sep 17 00:00:00 2001
From: Mandy Norrbo <mandy@jumpingrivers.com>
Date: Wed, 8 Feb 2023 13:29:21 +0000
Subject: [PATCH 007/200] Fix warnings?

---
 R/create_individual_file.R | 134 ++++++++++++++++++-------------------
 1 file changed, 67 insertions(+), 67 deletions(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index e3caf9ae6..f0d9f7b02 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -67,13 +67,13 @@ add_cij_columns <- function(episode_file) {
     )) %>%
     dplyr::mutate(
       preventable_admissions = dplyr::if_else(
-        (cij_ppa == 1 & Distinct_CIJ == 1),
+        (.data$cij_ppa == 1 & .data$Distinct_CIJ == 1),
         1,
         0
       ),
       preventable_beddays = dplyr::if_else(
-        (cij_ppa == 1 & Distinct_CIJ == 1),
-        as.numeric(cij_end_date - cij_start_date),
+        (.data$cij_ppa == 1 & .data$Distinct_CIJ == 1),
+        as.numeric(.data$cij_end_date - .data$cij_start_date),
         0
       )
     )
@@ -84,32 +84,32 @@ add_cij_columns <- function(episode_file) {
 #' @description Add new columns based on SMRType and recid which follow a pattern
 #' of prefixed column names created based on some condition.
 #'
-#' @inheritParams create_individual_filw
+#' @inheritParams create_individual_file
 add_all_columns <- function(episode_file) {
   episode_file %>%
-    add_acute_columns("Acute", (smrtype == "Acute-DC" | smrtype == "Acute-IP") & cij_pattype != "Maternity") %>%
-    add_mat_columns("Mat", recid == "02B" | cij_pattype == "Maternity") %>%
-    add_mh_columns("MH", recid == "04B" & cij_pattype != "Maternity") %>%
-    add_gls_columns("GLS", smrtype == "GLS-IP") %>%
-    add_op_columns("OP", recid == "00B") %>%
-    add_ae_columns("AE", recid == "AE2") %>%
-    add_pis_columns("PIS", recid == "PIS") %>%
-    add_ooh_columns("OoH", recid == "OoH") %>%
-    add_dn_columns("DN", recid == "DN") %>%
-    add_cmh_columns("CMH", recid == "CMH") %>%
-    add_dd_columns("DD", recid == "DD") %>%
-    add_nsu_columns("NSU", recid == "NSU") %>%
-    add_nrs_columns("NRS", recid == "NRS") %>%
-    add_hl1_columns("HL1", recid == "HL1") %>%
-    add_ch_columns("CH", recid == "CH") %>%
-    add_hc_columns("HC", recid == "HC") %>%
-    add_at_columns("AT", recid == "AT") %>%
-    add_sds_columns("SDS", recid == "SDS")
+    add_acute_columns("Acute", (.data$smrtype == "Acute-DC" | .data$smrtype == "Acute-IP") & .data$cij_pattype != "Maternity") %>%
+    add_mat_columns("Mat", .data$recid == "02B" | .data$cij_pattype == "Maternity") %>%
+    add_mh_columns("MH", .data$recid == "04B" & .data$cij_pattype != "Maternity") %>%
+    add_gls_columns("GLS", .data$smrtype == "GLS-IP") %>%
+    add_op_columns("OP", .data$recid == "00B") %>%
+    add_ae_columns("AE", .data$recid == "AE2") %>%
+    add_pis_columns("PIS", .data$recid == "PIS") %>%
+    add_ooh_columns("OoH", .data$recid == "OoH") %>%
+    add_dn_columns("DN", .data$recid == "DN") %>%
+    add_cmh_columns("CMH", .data$recid == "CMH") %>%
+    add_dd_columns("DD", .data$recid == "DD") %>%
+    add_nsu_columns("NSU", .data$recid == "NSU") %>%
+    add_nrs_columns("NRS", .data$recid == "NRS") %>%
+    add_hl1_columns("HL1", .data$recid == "HL1") %>%
+    add_ch_columns("CH", .data$recid == "CH") %>%
+    add_hc_columns("HC", .data$recid == "HC") %>%
+    add_at_columns("AT", .data$recid == "AT") %>%
+    add_sds_columns("SDS", .data$recid == "SDS")
 }
 
 #' Add Acute columns
 #'
-#' @inheritParams create_individuaL_file
+#' @inheritParams create_individual_file
 #' @param prefix Prefix to add to related columns, e.g. "Acute"
 #' @param condition Condition to create new columns based on
 add_acute_columns <- function(episode_file, prefix, condition) {
@@ -160,13 +160,13 @@ add_op_columns <- function(episode_file, prefix, condition) {
   episode_file <- episode_file %>%
     dplyr::mutate(
       "{prefix}_newcons_attendances" := dplyr::if_else(eval(condition_1), 1, NA_real_),
-      "{prefix}_cost_attend" := dplyr::if_else(eval(condition_1), cost_total_net, NA_real_)
+      "{prefix}_cost_attend" := dplyr::if_else(eval(condition_1), .data$cost_total_net, NA_real_)
     )
   condition_5_8 <- substitute(condition & attendance_status %in% c(5, 8))
   episode_file <- episode_file %>%
     dplyr::mutate(
       "{prefix}_newcons_dnas" := dplyr::if_else(eval(condition_5_8), 1, NA_real_),
-      "{prefix}_cost_dnas" := dplyr::if_else(eval(condition_5_8), cost_total_net_incdnas, NA_real_)
+      "{prefix}_cost_dnas" := dplyr::if_else(eval(condition_5_8), .data$cost_total_net_incdnas, NA_real_)
     )
   return(episode_file)
 }
@@ -178,7 +178,7 @@ add_ae_columns <- function(episode_file, prefix, condition) {
   condition <- substitute(condition)
   episode_file %>%
     add_standard_cols(prefix, condition, cost = TRUE) %>%
-    dplyr::mutate("{prefix}_attendances" := dplyr::if_else(eval(condition), cost_total_net, NA_real_))
+    dplyr::mutate("{prefix}_attendances" := dplyr::if_else(eval(condition), 1, NA_real_)) # count flag; the cost is already in {prefix}_cost
 }
 
 #' Add PIS columns
@@ -188,7 +188,7 @@ add_pis_columns <- function(episode_file, prefix, condition) {
   condition <- substitute(condition)
   episode_file %>%
     add_standard_cols(prefix, condition, cost = TRUE) %>%
-    dplyr::mutate("{prefix}_paid_items" := dplyr::if_else(eval(condition), no_paid_items, NA_real_))
+    dplyr::mutate("{prefix}_paid_items" := dplyr::if_else(eval(condition), .data$no_paid_items, NA_real_))
 }
 
 #' Add OoH columns
@@ -199,21 +199,21 @@ add_ooh_columns <- function(episode_file, prefix, condition) {
   episode_file <- episode_file %>%
     add_standard_cols(prefix, condition, cost = TRUE) %>%
     dplyr::mutate(
-      "{prefix}_homeV" := dplyr::if_else(eval(condition) & smrtype == "OOH-HomeV", 1, NA_real_),
-      "{prefix}_advice" := dplyr::if_else(eval(condition) & smrtype == "OOH-Advice", 1, NA_real_),
-      "{prefix}_DN" := dplyr::if_else(eval(condition) & smrtype == "OOH-DN", 1, NA_real_),
-      "{prefix}_NHS24" := dplyr::if_else(eval(condition) & smrtype == "OOH-NHS24", 1, NA_real_),
-      "{prefix}_other" := dplyr::if_else(eval(condition) & smrtype == "OOH-Other", 1, NA_real_),
-      "{prefix}_PCC" := dplyr::if_else(eval(condition) & smrtype == "OOH-PCC", 1, NA_real_),
-      ooh_covid_advice = dplyr::if_else(eval(condition) & smrtype == "OOH-C19Adv", 1, NA_real_),
-      ooh_covid_assessment = dplyr::if_else(eval(condition) & smrtype == "OOH-C19Ass", 1, NA_real_),
-      ooh_covid_other = dplyr::if_else(eval(condition) & smrtype == "OOH-C190th", 1, NA_real_)
+      "{prefix}_homeV" := dplyr::if_else(eval(condition) & .data$smrtype == "OOH-HomeV", 1, NA_real_),
+      "{prefix}_advice" := dplyr::if_else(eval(condition) & .data$smrtype == "OOH-Advice", 1, NA_real_),
+      "{prefix}_DN" := dplyr::if_else(eval(condition) & .data$smrtype == "OOH-DN", 1, NA_real_),
+      "{prefix}_NHS24" := dplyr::if_else(eval(condition) & .data$smrtype == "OOH-NHS24", 1, NA_real_),
+      "{prefix}_other" := dplyr::if_else(eval(condition) & .data$smrtype == "OOH-Other", 1, NA_real_),
+      "{prefix}_PCC" := dplyr::if_else(eval(condition) & .data$smrtype == "OOH-PCC", 1, NA_real_),
+      ooh_covid_advice = dplyr::if_else(eval(condition) & .data$smrtype == "OOH-C19Adv", 1, NA_real_),
+      ooh_covid_assessment = dplyr::if_else(eval(condition) & .data$smrtype == "OOH-C19Ass", 1, NA_real_),
+      ooh_covid_other = dplyr::if_else(eval(condition) & .data$smrtype == "OOH-C19Oth", 1, NA_real_) # letter O ("Oth"), not digit 0 - TODO confirm against the smrtype lookup
     )
 
   episode_file <- episode_file %>%
     dplyr::mutate(
-      OoH_consultation_time = dplyr::if_else(eval(condition), as.numeric((lubridate::seconds_to_period(keytime2) + keydate2_dateformat) - (lubridate::seconds_to_period(keytime1) + keydate1_dateformat), units = "mins"), NA_real_),
-      OoH_consultation_time = dplyr::if_else(OoH_consultation_time < 0, 0, OoH_consultation_time)
+      OoH_consultation_time = dplyr::if_else(eval(condition), as.numeric((lubridate::seconds_to_period(.data$keytime2) + .data$keydate2_dateformat) - (lubridate::seconds_to_period(.data$keytime1) + .data$keydate1_dateformat), units = "mins"), NA_real_),
+      OoH_consultation_time = dplyr::if_else(.data$OoH_consultation_time < 0, 0, .data$OoH_consultation_time) # clamp negative durations to 0
     )
   return(episode_file)
 }
@@ -225,7 +225,7 @@ add_dn_columns <- function(episode_file, prefix, condition) {
   condition <- substitute(condition)
   episode_file %>%
     add_standard_cols(prefix, condition, episode = TRUE, cost = TRUE) %>%
-    dplyr::mutate("{prefix}_contacts" := dplyr::if_else(eval(condition), totalnodncontacts, NA_real_))
+    dplyr::mutate("{prefix}_contacts" := dplyr::if_else(eval(condition), .data$totalnodncontacts, NA_real_))
 }
 
 #' Add CMH columns
@@ -247,13 +247,13 @@ add_dd_columns <- function(episode_file, prefix, condition) {
   episode_file <- episode_file %>%
     dplyr::mutate(
       "{prefix}_NonCode9_episodes" := dplyr::if_else(eval(condition_delay), 1, NA_real_),
-      "{prefix}_NonCode9_beddays" := dplyr::if_else(eval(condition_delay), yearstay, NA_real_)
+      "{prefix}_NonCode9_beddays" := dplyr::if_else(eval(condition_delay), .data$yearstay, NA_real_)
     )
   condition_delay_9 <- substitute(condition & primary_delay_reason == "9")
   episode_file <- episode_file %>%
     dplyr::mutate(
       "{prefix}_Code9_episodes" := dplyr::if_else(eval(condition_delay_9), 1, NA_real_),
-      "{prefix}_Code9_beddays" := dplyr::if_else(eval(condition_delay_9), yearstay, NA_real_)
+      "{prefix}_Code9_beddays" := dplyr::if_else(eval(condition_delay_9), .data$yearstay, NA_real_)
     )
   return(episode_file)
 }
@@ -295,12 +295,12 @@ add_ch_columns <- function(episode_file, prefix, condition) {
   episode_file %>%
     add_standard_cols(prefix, condition) %>%
     dplyr::mutate(
-      ch_cis_episodes = dplyr::if_else(eval(condition), first_ch_ep, NA_real_),
-      ch_cost_per_day = dplyr::if_else(eval(condition) & yearstay > 0, cost_total_net / yearstay, NA_real_),
-      ch_cost_per_day = dplyr::if_else(eval(condition) & yearstay == 0, cost_total_net / yearstay, ch_cost_per_day),
+      ch_cis_episodes = dplyr::if_else(eval(condition), .data$first_ch_ep, NA_real_),
+      ch_cost_per_day = dplyr::if_else(eval(condition) & .data$yearstay > 0, .data$cost_total_net / .data$yearstay, NA_real_),
+      ch_cost_per_day = dplyr::if_else(eval(condition) & .data$yearstay == 0, .data$cost_total_net, .data$ch_cost_per_day), # zero-day stay: use the total cost, dividing by yearstay would give Inf/NaN
       ch_no_cost = eval(condition) & is.na(ch_cost_per_day),
-      ch_ep_end = dplyr::if_else(eval(condition), keydate2_dateformat, lubridate::NA_Date_),
-      ch_ep_end = dplyr::if_else(eval(condition) & is.na(ch_ep_end), lubridate::quarter(zoo::as.yearqtr(sc_latest_submission), type = "date_first"), ch_ep_end)
+      ch_ep_end = dplyr::if_else(eval(condition), .data$keydate2_dateformat, lubridate::NA_Date_),
+      ch_ep_end = dplyr::if_else(eval(condition) & is.na(.data$ch_ep_end), lubridate::quarter(zoo::as.yearqtr(.data$sc_latest_submission), type = "date_first"), .data$ch_ep_end)
     )
 }
 
@@ -312,29 +312,29 @@ add_hc_columns <- function(episode_file, prefix, condition) {
   episode_file <- episode_file %>%
     add_standard_cols(prefix, condition, episode = TRUE) %>%
     dplyr::mutate(
-      "{prefix}_total_hours" := dplyr::if_else(eval(condition), hc_hours_annual, NA_real_),
-      "{prefix}_total_cost" := dplyr::if_else(eval(condition), cost_total_net, NA_real_),
+      "{prefix}_total_hours" := dplyr::if_else(eval(condition), .data$hc_hours_annual, NA_real_),
+      "{prefix}_total_cost" := dplyr::if_else(eval(condition), .data$cost_total_net, NA_real_),
     )
   condition_per <- substitute(condition & smrtype == "HC-Per")
   episode_file <- episode_file %>%
     dplyr::mutate(
       "{prefix}_personal_episodes" := dplyr::if_else(eval(condition_per), 1, NA_real_),
-      "{prefix}_personal_hours" := dplyr::if_else(eval(condition_per), HC_total_hours, NA_real_),
-      "{prefix}_personal_hours_cost" := dplyr::if_else(eval(condition_per), cost_total_net, NA_real_)
+      "{prefix}_personal_hours" := dplyr::if_else(eval(condition_per), .data$hc_hours_annual, NA_real_), # read the source column like the sibling *_hours columns, not the prefix-specific HC_total_hours
+      "{prefix}_personal_hours_cost" := dplyr::if_else(eval(condition_per), .data$cost_total_net, NA_real_)
     )
   condition_non_per <- substitute(condition & smrtype == "HC-Non-Per")
   episode_file <- episode_file %>%
     dplyr::mutate(
       "{prefix}_non_personal_episodes" := dplyr::if_else(eval(condition_non_per), 1, NA_real_),
-      "{prefix}_non_personal_hours" := dplyr::if_else(eval(condition_non_per), hc_hours_annual, NA_real_),
-      "{prefix}_non_personal_hours_cost" := dplyr::if_else(eval(condition_non_per), cost_total_net, NA_real_)
+      "{prefix}_non_personal_hours" := dplyr::if_else(eval(condition_non_per), .data$hc_hours_annual, NA_real_),
+      "{prefix}_non_personal_hours_cost" := dplyr::if_else(eval(condition_non_per), .data$cost_total_net, NA_real_)
     )
   condition_reabl <- substitute(condition & hc_reablement == 1)
   episode_file <- episode_file %>%
     dplyr::mutate(
       "{prefix}_reablement_episodes" := dplyr::if_else(eval(condition_reabl), 1, NA_real_),
-      "{prefix}_reablement_hours" := dplyr::if_else(eval(condition_reabl), hc_hours_annual, NA_real_),
-      "{prefix}_reablement_hours_cost" := dplyr::if_else(eval(condition_reabl), cost_total_net, NA_real_)
+      "{prefix}_reablement_hours" := dplyr::if_else(eval(condition_reabl), .data$hc_hours_annual, NA_real_),
+      "{prefix}_reablement_hours_cost" := dplyr::if_else(eval(condition_reabl), .data$cost_total_net, NA_real_)
     )
 }
 
@@ -346,8 +346,8 @@ add_at_columns <- function(episode_file, prefix, condition) {
   episode_file %>%
     add_standard_cols(prefix, condition) %>%
     dplyr::mutate(
-      "{prefix}_alarms" := dplyr::if_else(eval(condition) & smrtype == "AT-Alarm", 1, NA_real_),
-      "{prefix}_telecare" := dplyr::if_else(eval(condition) & smrtype == "AT-Tele", 1, NA_real_)
+      "{prefix}_alarms" := dplyr::if_else(eval(condition) & .data$smrtype == "AT-Alarm", 1, NA_real_),
+      "{prefix}_telecare" := dplyr::if_else(eval(condition) & .data$smrtype == "AT-Tele", 1, NA_real_)
     )
 }
 
@@ -359,10 +359,10 @@ add_sds_columns <- function(episode_file, prefix, condition) {
   episode_file %>%
     add_standard_cols(prefix, condition) %>%
     dplyr::mutate(
-      "{prefix}_option_1" := dplyr::if_else(eval(condition) & smrtype == "SDS-1", 1, NA_real_),
-      "{prefix}_option_2" := dplyr::if_else(eval(condition) & smrtype == "SDS-2", 1, NA_real_),
-      "{prefix}_option_3" := dplyr::if_else(eval(condition) & smrtype == "SDS-3", 1, NA_real_),
-      "{prefix}_option_4" := dplyr::if_else(eval(condition) & smrtype == "SDS-4", 1, NA_real_)
+      "{prefix}_option_1" := dplyr::if_else(eval(condition) & .data$smrtype == "SDS-1", 1, NA_real_),
+      "{prefix}_option_2" := dplyr::if_else(eval(condition) & .data$smrtype == "SDS-2", 1, NA_real_),
+      "{prefix}_option_3" := dplyr::if_else(eval(condition) & .data$smrtype == "SDS-3", 1, NA_real_),
+      "{prefix}_option_4" := dplyr::if_else(eval(condition) & .data$smrtype == "SDS-4", 1, NA_real_)
     )
 }
 
@@ -379,22 +379,22 @@ add_ipdc_cols <- function(episode_file, prefix, condition, ipdc_d = TRUE, electi
   episode_file <- episode_file %>%
     dplyr::mutate(
       "{prefix}_inpatient_episodes" := dplyr::if_else(eval(condition_i), 1, NA_real_),
-      "{prefix}_inpatient_beddays" := dplyr::if_else(eval(condition_i), yearstay, NA_real_)
+      "{prefix}_inpatient_beddays" := dplyr::if_else(eval(condition_i), .data$yearstay, NA_real_)
     )
   if (elective) {
     condition_el <- substitute(condition_i & cij_pattype == "Elective")
     episode_file <- episode_file %>%
       dplyr::mutate(
         "{prefix}_el_inpatient_episodes" := dplyr::if_else(eval(condition_el), 1, NA_real_),
-        "{prefix}_el_inpatient_beddays" := dplyr::if_else(eval(condition_el), yearstay, NA_real_),
-        "{prefix}_el_inpatient_cost" := dplyr::if_else(eval(condition_el), cost_total_net, NA_real_)
+        "{prefix}_el_inpatient_beddays" := dplyr::if_else(eval(condition_el), .data$yearstay, NA_real_),
+        "{prefix}_el_inpatient_cost" := dplyr::if_else(eval(condition_el), .data$cost_total_net, NA_real_)
       )
     condition_non_el <- substitute(condition_i & cij_pattype == "Non-Elective")
     episode_file <- episode_file %>%
       dplyr::mutate(
         "{prefix}_non_el_inpatient_episodes" := dplyr::if_else(eval(condition_non_el), 1, NA_real_),
-        "{prefix}_non_el_inpatient_beddays" := dplyr::if_else(eval(condition_non_el), yearstay, NA_real_),
-        "{prefix}_non_el_inpatient_cost" := dplyr::if_else(eval(condition_non_el), cost_total_net, NA_real_)
+        "{prefix}_non_el_inpatient_beddays" := dplyr::if_else(eval(condition_non_el), .data$yearstay, NA_real_),
+        "{prefix}_non_el_inpatient_cost" := dplyr::if_else(eval(condition_non_el), .data$cost_total_net, NA_real_)
       )
   }
   if (ipdc_d) {
@@ -402,7 +402,7 @@ add_ipdc_cols <- function(episode_file, prefix, condition, ipdc_d = TRUE, electi
     episode_file <- episode_file %>%
       dplyr::mutate(
         "{prefix}_daycase_episodes" := dplyr::if_else(eval(condition_d), 1, NA_real_),
-        "{prefix}_daycase_cost" := dplyr::if_else(eval(condition_d), cost_total_net, NA_real_)
+        "{prefix}_daycase_cost" := dplyr::if_else(eval(condition_d), .data$cost_total_net, NA_real_)
       )
   }
   return(episode_file)
@@ -422,7 +422,7 @@ add_standard_cols <- function(episode_file, prefix, condition, drop = NULL, epis
     episode_file <- dplyr::mutate(episode_file, "{prefix}_episodes" := dplyr::if_else(eval(condition), 1, NA_real_))
   }
   if (cost) {
-    episode_file <- dplyr::mutate(episode_file, "{prefix}_cost" := dplyr::if_else(eval(condition), cost_total_net, NA_real_))
+    episode_file <- dplyr::mutate(episode_file, "{prefix}_cost" := dplyr::if_else(eval(condition), .data$cost_total_net, NA_real_))
   }
   return(episode_file)
 }

From f466f66a96ad86abbe6844fa81ef625a1d1a7d22 Mon Sep 17 00:00:00 2001
From: Mandy Norrbo <mandy@jumpingrivers.com>
Date: Wed, 8 Feb 2023 13:30:39 +0000
Subject: [PATCH 008/200] Update DESCRIPTION

---
 DESCRIPTION | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 4ac6f798f..f7307ce85 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -49,7 +49,8 @@ Imports:
     stringr (>= 1.4.0),
     tibble (>= 3.1.7),
     tidyr (>= 1.2.0),
-    tidyselect (>= 1.2.0)
+    tidyselect (>= 1.2.0),
+    zoo (>= 1.8.0)
 Suggests:
     covr (>= 3.6.1),
     roxygen2 (>= 7.2.3),

From 567be35ba2dee02df66219eea6d2fcc714712801 Mon Sep 17 00:00:00 2001
From: jr-mandy <jr-mandy@users.noreply.github.com>
Date: Wed, 8 Feb 2023 13:34:11 +0000
Subject: [PATCH 009/200] Update documentation

---
 man/add_acute_columns.Rd | 2 ++
 man/add_ae_columns.Rd    | 2 ++
 man/add_all_columns.Rd   | 3 +++
 man/add_at_columns.Rd    | 2 ++
 man/add_ch_columns.Rd    | 2 ++
 man/add_cmh_columns.Rd   | 2 ++
 man/add_dd_columns.Rd    | 2 ++
 man/add_dn_columns.Rd    | 2 ++
 man/add_gls_columns.Rd   | 2 ++
 man/add_hc_columns.Rd    | 2 ++
 man/add_hl1_columns.Rd   | 2 ++
 man/add_ipdc_cols.Rd     | 2 ++
 man/add_mat_columns.Rd   | 2 ++
 man/add_mh_columns.Rd    | 2 ++
 man/add_nrs_columns.Rd   | 2 ++
 man/add_nsu_columns.Rd   | 2 ++
 man/add_ooh_columns.Rd   | 2 ++
 man/add_op_columns.Rd    | 2 ++
 man/add_pis_columns.Rd   | 2 ++
 man/add_sds_columns.Rd   | 2 ++
 man/add_standard_cols.Rd | 2 ++
 man/create_col.Rd        | 2 ++
 man/create_cols.Rd       | 2 ++
 23 files changed, 47 insertions(+)

diff --git a/man/add_acute_columns.Rd b/man/add_acute_columns.Rd
index db02a88a5..52ba071b6 100644
--- a/man/add_acute_columns.Rd
+++ b/man/add_acute_columns.Rd
@@ -7,6 +7,8 @@
 add_acute_columns(episode_file, prefix, condition)
 }
 \arguments{
+\item{episode_file}{Tibble containing episodic data}
+
 \item{prefix}{Prefix to add to related columns, e.g. "Acute"}
 
 \item{condition}{Condition to create new columns based on}
diff --git a/man/add_ae_columns.Rd b/man/add_ae_columns.Rd
index 3c90fb4f1..9b7099513 100644
--- a/man/add_ae_columns.Rd
+++ b/man/add_ae_columns.Rd
@@ -7,6 +7,8 @@
 add_ae_columns(episode_file, prefix, condition)
 }
 \arguments{
+\item{episode_file}{Tibble containing episodic data}
+
 \item{prefix}{Prefix to add to related columns, e.g. "Acute"}
 
 \item{condition}{Condition to create new columns based on}
diff --git a/man/add_all_columns.Rd b/man/add_all_columns.Rd
index ce0540864..d502e95c3 100644
--- a/man/add_all_columns.Rd
+++ b/man/add_all_columns.Rd
@@ -6,6 +6,9 @@
 \usage{
 add_all_columns(episode_file)
 }
+\arguments{
+\item{episode_file}{Tibble containing episodic data}
+}
 \description{
 Add new columns based on SMRType and recid which follow a pattern
 of prefixed column names created based on some condition.
diff --git a/man/add_at_columns.Rd b/man/add_at_columns.Rd
index 5cb469eea..e05ea9101 100644
--- a/man/add_at_columns.Rd
+++ b/man/add_at_columns.Rd
@@ -7,6 +7,8 @@
 add_at_columns(episode_file, prefix, condition)
 }
 \arguments{
+\item{episode_file}{Tibble containing episodic data}
+
 \item{prefix}{Prefix to add to related columns, e.g. "Acute"}
 
 \item{condition}{Condition to create new columns based on}
diff --git a/man/add_ch_columns.Rd b/man/add_ch_columns.Rd
index e0abfeaa0..4938f7690 100644
--- a/man/add_ch_columns.Rd
+++ b/man/add_ch_columns.Rd
@@ -7,6 +7,8 @@
 add_ch_columns(episode_file, prefix, condition)
 }
 \arguments{
+\item{episode_file}{Tibble containing episodic data}
+
 \item{prefix}{Prefix to add to related columns, e.g. "Acute"}
 
 \item{condition}{Condition to create new columns based on}
diff --git a/man/add_cmh_columns.Rd b/man/add_cmh_columns.Rd
index ebb80c293..a1d82cba6 100644
--- a/man/add_cmh_columns.Rd
+++ b/man/add_cmh_columns.Rd
@@ -7,6 +7,8 @@
 add_cmh_columns(episode_file, prefix, condition)
 }
 \arguments{
+\item{episode_file}{Tibble containing episodic data}
+
 \item{prefix}{Prefix to add to related columns, e.g. "Acute"}
 
 \item{condition}{Condition to create new columns based on}
diff --git a/man/add_dd_columns.Rd b/man/add_dd_columns.Rd
index 425169e70..08d9c0fe4 100644
--- a/man/add_dd_columns.Rd
+++ b/man/add_dd_columns.Rd
@@ -7,6 +7,8 @@
 add_dd_columns(episode_file, prefix, condition)
 }
 \arguments{
+\item{episode_file}{Tibble containing episodic data}
+
 \item{prefix}{Prefix to add to related columns, e.g. "Acute"}
 
 \item{condition}{Condition to create new columns based on}
diff --git a/man/add_dn_columns.Rd b/man/add_dn_columns.Rd
index 0f97bd01f..bf6af008f 100644
--- a/man/add_dn_columns.Rd
+++ b/man/add_dn_columns.Rd
@@ -7,6 +7,8 @@
 add_dn_columns(episode_file, prefix, condition)
 }
 \arguments{
+\item{episode_file}{Tibble containing episodic data}
+
 \item{prefix}{Prefix to add to related columns, e.g. "Acute"}
 
 \item{condition}{Condition to create new columns based on}
diff --git a/man/add_gls_columns.Rd b/man/add_gls_columns.Rd
index 4475fa5d0..e71dc755b 100644
--- a/man/add_gls_columns.Rd
+++ b/man/add_gls_columns.Rd
@@ -7,6 +7,8 @@
 add_gls_columns(episode_file, prefix, condition)
 }
 \arguments{
+\item{episode_file}{Tibble containing episodic data}
+
 \item{prefix}{Prefix to add to related columns, e.g. "Acute"}
 
 \item{condition}{Condition to create new columns based on}
diff --git a/man/add_hc_columns.Rd b/man/add_hc_columns.Rd
index 60352e37b..95d8f1d3b 100644
--- a/man/add_hc_columns.Rd
+++ b/man/add_hc_columns.Rd
@@ -7,6 +7,8 @@
 add_hc_columns(episode_file, prefix, condition)
 }
 \arguments{
+\item{episode_file}{Tibble containing episodic data}
+
 \item{prefix}{Prefix to add to related columns, e.g. "Acute"}
 
 \item{condition}{Condition to create new columns based on}
diff --git a/man/add_hl1_columns.Rd b/man/add_hl1_columns.Rd
index 03dcc2dac..7600db5e9 100644
--- a/man/add_hl1_columns.Rd
+++ b/man/add_hl1_columns.Rd
@@ -7,6 +7,8 @@
 add_hl1_columns(episode_file, prefix, condition)
 }
 \arguments{
+\item{episode_file}{Tibble containing episodic data}
+
 \item{prefix}{Prefix to add to related columns, e.g. "Acute"}
 
 \item{condition}{Condition to create new columns based on}
diff --git a/man/add_ipdc_cols.Rd b/man/add_ipdc_cols.Rd
index 537f6d0ce..0f91cbd90 100644
--- a/man/add_ipdc_cols.Rd
+++ b/man/add_ipdc_cols.Rd
@@ -7,6 +7,8 @@
 add_ipdc_cols(episode_file, prefix, condition, ipdc_d = TRUE, elective = TRUE)
 }
 \arguments{
+\item{episode_file}{Tibble containing episodic data}
+
 \item{prefix}{Prefix to add to related columns, e.g. "Acute"}
 
 \item{condition}{Condition to create new columns based on}
diff --git a/man/add_mat_columns.Rd b/man/add_mat_columns.Rd
index 2836faa2a..aae729323 100644
--- a/man/add_mat_columns.Rd
+++ b/man/add_mat_columns.Rd
@@ -7,6 +7,8 @@
 add_mat_columns(episode_file, prefix, condition)
 }
 \arguments{
+\item{episode_file}{Tibble containing episodic data}
+
 \item{prefix}{Prefix to add to related columns, e.g. "Acute"}
 
 \item{condition}{Condition to create new columns based on}
diff --git a/man/add_mh_columns.Rd b/man/add_mh_columns.Rd
index 5c1279656..3c50c6cb8 100644
--- a/man/add_mh_columns.Rd
+++ b/man/add_mh_columns.Rd
@@ -7,6 +7,8 @@
 add_mh_columns(episode_file, prefix, condition)
 }
 \arguments{
+\item{episode_file}{Tibble containing episodic data}
+
 \item{prefix}{Prefix to add to related columns, e.g. "Acute"}
 
 \item{condition}{Condition to create new columns based on}
diff --git a/man/add_nrs_columns.Rd b/man/add_nrs_columns.Rd
index 20076ce93..9d7b3f8bf 100644
--- a/man/add_nrs_columns.Rd
+++ b/man/add_nrs_columns.Rd
@@ -7,6 +7,8 @@
 add_nrs_columns(episode_file, prefix, condition)
 }
 \arguments{
+\item{episode_file}{Tibble containing episodic data}
+
 \item{prefix}{Prefix to add to related columns, e.g. "Acute"}
 
 \item{condition}{Condition to create new columns based on}
diff --git a/man/add_nsu_columns.Rd b/man/add_nsu_columns.Rd
index 8518dc6dd..6a54bbcbf 100644
--- a/man/add_nsu_columns.Rd
+++ b/man/add_nsu_columns.Rd
@@ -7,6 +7,8 @@
 add_nsu_columns(episode_file, prefix, condition)
 }
 \arguments{
+\item{episode_file}{Tibble containing episodic data}
+
 \item{prefix}{Prefix to add to related columns, e.g. "Acute"}
 
 \item{condition}{Condition to create new columns based on}
diff --git a/man/add_ooh_columns.Rd b/man/add_ooh_columns.Rd
index 5a9078259..01814ab6d 100644
--- a/man/add_ooh_columns.Rd
+++ b/man/add_ooh_columns.Rd
@@ -7,6 +7,8 @@
 add_ooh_columns(episode_file, prefix, condition)
 }
 \arguments{
+\item{episode_file}{Tibble containing episodic data}
+
 \item{prefix}{Prefix to add to related columns, e.g. "Acute"}
 
 \item{condition}{Condition to create new columns based on}
diff --git a/man/add_op_columns.Rd b/man/add_op_columns.Rd
index 5fd8d78c7..08c4419e2 100644
--- a/man/add_op_columns.Rd
+++ b/man/add_op_columns.Rd
@@ -7,6 +7,8 @@
 add_op_columns(episode_file, prefix, condition)
 }
 \arguments{
+\item{episode_file}{Tibble containing episodic data}
+
 \item{prefix}{Prefix to add to related columns, e.g. "Acute"}
 
 \item{condition}{Condition to create new columns based on}
diff --git a/man/add_pis_columns.Rd b/man/add_pis_columns.Rd
index b19178df7..b582acf2e 100644
--- a/man/add_pis_columns.Rd
+++ b/man/add_pis_columns.Rd
@@ -7,6 +7,8 @@
 add_pis_columns(episode_file, prefix, condition)
 }
 \arguments{
+\item{episode_file}{Tibble containing episodic data}
+
 \item{prefix}{Prefix to add to related columns, e.g. "Acute"}
 
 \item{condition}{Condition to create new columns based on}
diff --git a/man/add_sds_columns.Rd b/man/add_sds_columns.Rd
index ec2a4668e..d5a5fb2cf 100644
--- a/man/add_sds_columns.Rd
+++ b/man/add_sds_columns.Rd
@@ -7,6 +7,8 @@
 add_sds_columns(episode_file, prefix, condition)
 }
 \arguments{
+\item{episode_file}{Tibble containing episodic data}
+
 \item{prefix}{Prefix to add to related columns, e.g. "Acute"}
 
 \item{condition}{Condition to create new columns based on}
diff --git a/man/add_standard_cols.Rd b/man/add_standard_cols.Rd
index 2e22df08a..becec0ddd 100644
--- a/man/add_standard_cols.Rd
+++ b/man/add_standard_cols.Rd
@@ -14,6 +14,8 @@ add_standard_cols(
 )
 }
 \arguments{
+\item{episode_file}{Tibble containing episodic data}
+
 \item{prefix}{Prefix to add to related columns, e.g. "Acute"}
 
 \item{condition}{Condition to create new columns based on}
diff --git a/man/create_col.Rd b/man/create_col.Rd
index 496057bc1..7357adf5d 100644
--- a/man/create_col.Rd
+++ b/man/create_col.Rd
@@ -7,6 +7,8 @@
 create_col(episode_file, col, prefix, condition)
 }
 \arguments{
+\item{episode_file}{Tibble containing episodic data}
+
 \item{col}{Which column to create ("DoB", "postcode", or "gpprac")}
 
 \item{prefix}{Prefix to add to related columns, e.g. "Acute"}
diff --git a/man/create_cols.Rd b/man/create_cols.Rd
index d6540fa89..6bbe1d98a 100644
--- a/man/create_cols.Rd
+++ b/man/create_cols.Rd
@@ -7,6 +7,8 @@
 create_cols(episode_file, prefix, condition, drop)
 }
 \arguments{
+\item{episode_file}{Tibble containing episodic data}
+
 \item{prefix}{Prefix to add to related columns, e.g. "Acute"}
 
 \item{condition}{Condition to create new columns based on}

From 90257a63688f9736a6036978756fb6f1ef564e64 Mon Sep 17 00:00:00 2001
From: Mandy Norrbo <mandy@jumpingrivers.com>
Date: Thu, 9 Feb 2023 09:25:37 +0000
Subject: [PATCH 010/200] WIP

---
 R/create_individual_file.R | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index f0d9f7b02..8259d126c 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -9,7 +9,10 @@ create_individual_file <- function(episode_file) {
     find_non_duplicates(.data$cij_marker, "Distinct_CIJ") %>%
     add_cij_columns() %>%
     find_non_duplicates(.data$ch_chi_cis, "first_ch_ep") %>%
-    add_all_columns()
+    add_all_columns() %>%
+    find_non_duplicates(.data$ooh_case_id, "unique_ooh_case") %>%
+    dplyr::mutate(unique_ooh_case = dplyr::if_else(recid != "OoH", 0, unique_ooh_case)) %>%
+    aggregate_cis_episodes()
 }
 
 #' Remove blank CHI
@@ -469,3 +472,14 @@ na_type <- function(col = c("DoB", "postcode", "gpprac")) {
   )
   return(na_type)
 }
+
+aggregate_cis_episodes <- function(episode_file) {
+  episode_file %>%
+    dplyr::group_by(.data$chi, .data$ch_chi_cis == 1) %>%
+    dplyr::mutate(
+      ch_no_cost = max(.data$ch_no_cost),
+      ch_ep_start = min(.data$keydate1_dateformat),
+      ch_ep_end = max(.data$ch_ep_end),
+      ch_cost_per_day = mean(.data$ch_cost_per_day)
+    )
+}

From b0b1460cfee0525739e34cb73ff8ef699eb030e9 Mon Sep 17 00:00:00 2001
From: Mandy Norrbo <mandy@jumpingrivers.com>
Date: Thu, 9 Feb 2023 09:51:45 +0000
Subject: [PATCH 011/200] Add rowwise() as quarter not vectorised

---
 R/create_individual_file.R | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index f0d9f7b02..5445ae623 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -299,9 +299,12 @@ add_ch_columns <- function(episode_file, prefix, condition) {
       ch_cost_per_day = dplyr::if_else(eval(condition) & .data$yearstay > 0, .data$cost_total_net / .data$yearstay, NA_real_),
       ch_cost_per_day = dplyr::if_else(eval(condition) & .data$yearstay == 0, .data$cost_total_net / .data$yearstay, .data$ch_cost_per_day),
       ch_no_cost = eval(condition) & is.na(ch_cost_per_day),
-      ch_ep_end = dplyr::if_else(eval(condition), .data$keydate2_dateformat, lubridate::NA_Date_),
+      ch_ep_end = dplyr::if_else(eval(condition), .data$keydate2_dateformat, lubridate::NA_Date_)) %>%
+    dplyr::rowwise() %>%
+    dplyr::mutate(
       ch_ep_end = dplyr::if_else(eval(condition) & is.na(ch_ep_end), lubridate::quarter(zoo::as.yearqtr(.data$sc_latest_submission), type = "date_first"), .data$ch_ep_end)
-    )
+    ) %>%
+    dplyr::ungroup()
 }
 
 #' Add HC columns
@@ -469,3 +472,4 @@ na_type <- function(col = c("DoB", "postcode", "gpprac")) {
   )
   return(na_type)
 }
+

From 50f83739035e94d1f0538ee964f7e65e3e1e4074 Mon Sep 17 00:00:00 2001
From: jr-mandy <jr-mandy@users.noreply.github.com>
Date: Thu, 9 Feb 2023 09:54:30 +0000
Subject: [PATCH 012/200] Style code

---
 R/create_individual_file.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index 5445ae623..4bc7e68d1 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -299,7 +299,8 @@ add_ch_columns <- function(episode_file, prefix, condition) {
       ch_cost_per_day = dplyr::if_else(eval(condition) & .data$yearstay > 0, .data$cost_total_net / .data$yearstay, NA_real_),
       ch_cost_per_day = dplyr::if_else(eval(condition) & .data$yearstay == 0, .data$cost_total_net / .data$yearstay, .data$ch_cost_per_day),
       ch_no_cost = eval(condition) & is.na(ch_cost_per_day),
-      ch_ep_end = dplyr::if_else(eval(condition), .data$keydate2_dateformat, lubridate::NA_Date_)) %>%
+      ch_ep_end = dplyr::if_else(eval(condition), .data$keydate2_dateformat, lubridate::NA_Date_)
+    ) %>%
     dplyr::rowwise() %>%
     dplyr::mutate(
       ch_ep_end = dplyr::if_else(eval(condition) & is.na(ch_ep_end), lubridate::quarter(zoo::as.yearqtr(.data$sc_latest_submission), type = "date_first"), .data$ch_ep_end)
@@ -472,4 +473,3 @@ na_type <- function(col = c("DoB", "postcode", "gpprac")) {
   )
   return(na_type)
 }
-

From 0c426c5d9fb47840dd4d17c6406f27698c0de9a6 Mon Sep 17 00:00:00 2001
From: Mandy Norrbo <mandy@jumpingrivers.com>
Date: Tue, 14 Feb 2023 11:06:47 +0000
Subject: [PATCH 013/200] Until L594

---
 R/create_individual_file.R | 167 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 164 insertions(+), 3 deletions(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index b12971990..afd925b76 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -12,7 +12,10 @@ create_individual_file <- function(episode_file) {
     add_all_columns() %>%
     find_non_duplicates(.data$ooh_case_id, "unique_ooh_case") %>%
     dplyr::mutate(unique_ooh_case = dplyr::if_else(recid != "OoH", 0, unique_ooh_case)) %>%
-    aggregate_cis_episodes()
+    aggregate_cis_episodes() %>%
+    clean_up_ch() %>%
+    recode_gender() %>%
+    aggregate_by_chi()
 }
 
 #' Remove blank CHI
@@ -305,7 +308,7 @@ add_ch_columns <- function(episode_file, prefix, condition) {
       ch_ep_end = dplyr::if_else(eval(condition), .data$keydate2_dateformat, lubridate::NA_Date_)) %>%
     dplyr::rowwise() %>%
     dplyr::mutate(
-      ch_ep_end = dplyr::if_else(eval(condition) & is.na(ch_ep_end), lubridate::quarter(zoo::as.yearqtr(.data$sc_latest_submission), type = "date_first"), .data$ch_ep_end)
+      ch_ep_end = dplyr::if_else(eval(condition) & is.na(ch_ep_end), lubridate::quarter(zoo::as.yearqtr(.data$sc_latest_submission), type = "date_first", fiscal_start = 4), .data$ch_ep_end)
     ) %>%
     dplyr::ungroup()
 }
@@ -478,11 +481,169 @@ na_type <- function(col = c("DoB", "postcode", "gpprac")) {
 
 aggregate_cis_episodes <- function(episode_file) {
   episode_file %>%
-    dplyr::group_by(.data$chi, .data$ch_chi_cis == 1) %>%
+    dplyr::group_by(.data$chi, .data$ch_chi_cis) %>%
     dplyr::mutate(
       ch_no_cost = max(.data$ch_no_cost),
       ch_ep_start = min(.data$keydate1_dateformat),
       ch_ep_end = max(.data$ch_ep_end),
       ch_cost_per_day = mean(.data$ch_cost_per_day)
+    ) %>%
+    dplyr::ungroup()
+}
+
+#' @inheritParams create_individual_file
+clean_up_ch <- function(episode_file) {
+  episode_file %>%
+    dplyr::mutate(
+        fy_end = date_from_fy(year, "end") + 1,
+        fy_start = date_from_fy(year, "start")) %>%
+    dplyr::rowwise() %>%
+    dplyr::mutate(
+        term_1 = min(ch_ep_end, fy_end + 1),
+        term_2 = max(ch_ep_start, fy_start)
+    ) %>%
+    dplyr::ungroup() %>%
+    dplyr::mutate(
+      ch_beddays = dplyr::if_else(
+        recid == "CH",
+        as.numeric(term_1 - term_2),
+        NA_real_
+      ),
+      ch_cost = dplyr::if_else(
+        recid == "CH" & ch_no_cost == 0,
+        ch_beddays * ch_cost_per_day,
+        NA_real_
+      ),
+      ch_beddays = dplyr::if_else(
+        recid == "CH" & first_ch_ep == 0,
+        0,
+        ch_beddays
+      ),
+      ch_cost = dplyr::if_else(
+        recid == "CH" & first_ch_ep == 0,
+        0,
+        ch_cost
+      )
+    ) %>%
+    dplyr::select(
+      -fy_end, -fy_start, -term_1, -term_2
     )
 }
+
+date_from_fy <- function(financial_year, type = c("start", "end")) {
+  match.arg(type)
+  n <- switch(type,
+              "start" = 0,
+              "end" = 2)
+  year = as.numeric(paste0("20", substr(financial_year, 1 + n, 2 + n)))
+  if (type == "start") {
+    date <- lubridate::make_date(year, 4, 1)
+    return(date)
+  }
+  date <- lubridate::make_date(year, 3, 31)
+  return(date)
+}
+
+recode_gender <- function(episode_file) {
+  episode_file %>%
+    dplyr::mutate(
+      gender = dplyr::if_else(
+        gender == 0 | gender == 9,
+        1.5,
+        gender
+      )
+    )
+}
+
+aggregate_by_chi <- function(episode_file) {
+  episode_file %>%
+    dplyr::arrange(chi,
+                   keydate1_dateformat,
+                   keytime1,
+                   keydate2_dateformat,
+                   keytime2) %>%
+    dplyr::group_by(chi) %>%
+    dplyr::summarise(
+      gender = mean(gender),
+      dplyr::across(dplyr::ends_with(c(
+        "postcode", "DoB", "gpprac"
+      )), ~ dplyr::last(., na_rm = TRUE)),
+      dplyr::across(
+        c(
+          "CIJ_el",
+          "CIJ_non_el",
+          "CIJ_mat",
+          "cij_delay",
+          dplyr::ends_with(
+            c(
+              "episodes",
+              "beddays",
+              "cost",
+              "attendances",
+              "case",
+              "attend",
+              "contacts",
+              "hours",
+              "alarms",
+              "telecare",
+              "paid_items",
+              "advice",
+              "homeV",
+              "time",
+              "admissions"
+            )
+          ),
+          dplyr::starts_with("SDS_option")
+        ),
+        ~ sum(., na.rm = TRUE)
+      ),
+      dplyr::across(
+        c(
+          dplyr::starts_with("sc_"),
+          -"sc_send_lca",
+          -"sc_latest_submission",
+          "hh_in_FY",
+          "NSU"
+        ),
+        ~ max(., na.rm = TRUE)
+      ),
+      dplyr::across(
+        c(condition_cols(),
+          dplyr::ends_with(c(
+            "_Cohort", "end_fy", "start_fy"
+          )),),
+        ~ dplyr::first(., na_rm = TRUE)
+      )
+    )
+}
+
+
+conditions_cols <- function() {
+  condition_cols <- c(
+    "arth",
+    "asthma",
+    "atrialfib",
+    "cancer",
+    "cvd",
+    "liver",
+    "copd",
+    "dementia",
+    "diabetes",
+    "epilepsy",
+    "chd",
+    "hefailure",
+    "ms",
+    "parkinsons",
+    "refailure",
+    "congen" ,
+    "bloodbfo",
+    "endomet",
+    "digestive"
+  )
+  date_cols <- paste0(conditions, "_date")
+  all_cols <- c(condition_cols, date_cols)
+  return(all_cols)
+}
+
+
+# need to rename: OoH_cases, HL1_in_FY

From 7f17ef24ff08cee78af16c4e90f9917045e261f9 Mon Sep 17 00:00:00 2001
From: Mandy Norrbo <mandy@jumpingrivers.com>
Date: Thu, 16 Feb 2023 10:32:10 +0000
Subject: [PATCH 014/200] Converted until L677

---
 R/create_individual_file.R | 128 +++++++++++++++++++++++++++++++++----
 1 file changed, 116 insertions(+), 12 deletions(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index afd925b76..ff84fc98f 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -15,7 +15,8 @@ create_individual_file <- function(episode_file) {
     aggregate_cis_episodes() %>%
     clean_up_ch() %>%
     recode_gender() %>%
-    aggregate_by_chi()
+    aggregate_by_chi() %>%
+    clean_individual_file()
 }
 
 #' Remove blank CHI
@@ -212,7 +213,7 @@ add_ooh_columns <- function(episode_file, prefix, condition) {
       "{prefix}_other" := dplyr::if_else(eval(condition) & .data$smrtype == "OOH-Other", 1, NA_real_),
       "{prefix}_PCC" := dplyr::if_else(eval(condition) & .data$smrtype == "OOH-PCC", 1, NA_real_),
       ooh_covid_advice = dplyr::if_else(eval(condition) & .data$smrtype == "OOH-C19Adv", 1, NA_real_),
-      ooh_covid_assessment = dplyr::if_else(eval(condition) & .data$smrtype == "OOH-C19Ass", 1, NA_real_),
+      ooh_covid_assesment = dplyr::if_else(eval(condition) & .data$smrtype == "OOH-C19Ass", 1, NA_real_),
       ooh_covid_other = dplyr::if_else(eval(condition) & .data$smrtype == "OOH-C190th", 1, NA_real_)
     )
 
@@ -387,6 +388,7 @@ add_ipdc_cols <- function(episode_file, prefix, condition, ipdc_d = TRUE, electi
   condition_i <- substitute(eval(condition) & ipdc == "I")
   episode_file <- episode_file %>%
     dplyr::mutate(
+      "{prefix}_inpatient_cost" := dplyr::if_else(eval(condition_i), .data$cost_total_net, NA_real_),
       "{prefix}_inpatient_episodes" := dplyr::if_else(eval(condition_i), 1, NA_real_),
       "{prefix}_inpatient_beddays" := dplyr::if_else(eval(condition_i), .data$yearstay, NA_real_)
     )
@@ -530,17 +532,21 @@ clean_up_ch <- function(episode_file) {
     )
 }
 
-date_from_fy <- function(financial_year, type = c("start", "end")) {
+date_from_fy <- function(financial_year, type = c("start", "end", "mid")) {
   match.arg(type)
   n <- switch(type,
               "start" = 0,
+              "mid" = 0,
               "end" = 2)
   year = as.numeric(paste0("20", substr(financial_year, 1 + n, 2 + n)))
   if (type == "start") {
     date <- lubridate::make_date(year, 4, 1)
     return(date)
+  } else if (type == "end") {
+    date <- lubridate::make_date(year, 3, 31)
+    return(date)
   }
-  date <- lubridate::make_date(year, 3, 31)
+  date <- lubridate::make_date(year, 9, 30)
   return(date)
 }
 
@@ -574,13 +580,13 @@ aggregate_by_chi <- function(episode_file) {
           "CIJ_non_el",
           "CIJ_mat",
           "cij_delay",
+          "OoH_cases" = "unique_ooh_case",
           dplyr::ends_with(
             c(
               "episodes",
               "beddays",
               "cost",
               "attendances",
-              "case",
               "attend",
               "contacts",
               "hours",
@@ -590,7 +596,13 @@ aggregate_by_chi <- function(episode_file) {
               "advice",
               "homeV",
               "time",
-              "admissions"
+              "admissions",
+              "assesment",
+              "other",
+              "DN",
+              "NHS24",
+              "PCC",
+              "_dnas"
             )
           ),
           dplyr::starts_with("SDS_option")
@@ -602,13 +614,16 @@ aggregate_by_chi <- function(episode_file) {
           dplyr::starts_with("sc_"),
           -"sc_send_lca",
           -"sc_latest_submission",
-          "hh_in_FY",
+          "HL1_in_FY" = "hh_in_fy",
           "NSU"
         ),
-        ~ max(., na.rm = TRUE)
+        ~ max_no_inf(.)
       ),
       dplyr::across(
         c(condition_cols(),
+          "death_date",
+          "deceased",
+          "year",
           dplyr::ends_with(c(
             "_Cohort", "end_fy", "start_fy"
           )),),
@@ -618,8 +633,8 @@ aggregate_by_chi <- function(episode_file) {
 }
 
 
-conditions_cols <- function() {
-  condition_cols <- c(
+condition_cols <- function() {
+  conditions <- c(
     "arth",
     "asthma",
     "atrialfib",
@@ -641,9 +656,98 @@ conditions_cols <- function() {
     "digestive"
   )
   date_cols <- paste0(conditions, "_date")
-  all_cols <- c(condition_cols, date_cols)
+  all_cols <- c(conditions, date_cols)
   return(all_cols)
 }
 
+max_no_inf <- function(x) {
+  ifelse(!all(is.na(x)), max(x, na.rm = TRUE), NA)
+}
+
+clean_individual_file <- function(individual_file) {
+  individual_file %>%
+    drop_cols() %>%
+    clean_up_gender() %>%
+    clean_up_dob()
+}
+
+drop_cols <- function(individual_file) {
+  individual_file %>%
+    dplyr::select(
+      -month_cols(),
+      -"ch_no_cost",
+      -"dob",
+      -"postcode",
+      -"gpprac",
+      -"no_paid_items",
+      -"totalnodncontacts"
+    )
+}
+
+month_cols <- function() {
+  suffix <- c("_beddays", "_cost")
+  months <- tolower(c(rep(month.abb, each = 2)))
+  month_cols <- paste0(months, suffix)
+  return(month_cols)
+}
+
+clean_up_gender <- function(individual_file) {
+  individual_file %>%
+    dplyr::mutate(
+      gender = dplyr::case_when(
+        gender != 1.5 ~ round(gender),
+        as.numeric(substr(chi_subset, 9, 9)) %% 2 == 1 ~ 1,
+        TRUE ~ 2
+      ),
+      gender = dplyr::case_when(
+        gender == 1 ~ "Male",
+        gender == 2 ~ "Female"
+      )
+    )
+}
+
+clean_up_dob <- function(individual_file) {
+  individual_file %>%
+    dplyr::mutate(
+      chi_dob_1 = lubridate::dmy(paste0(substr(.data$chi, 1, 4), "19", substr(.data$chi, 5, 6))),
+      chi_dob_2 = lubridate::dmy(paste0(substr(.data$chi, 1, 4), "20", substr(.data$chi, 5, 6))),
+      chi_age_1 = as.numeric(lubridate::interval(lubridate::ymd(.data$chi_dob_1), date_from_fy(year, "mid")), "years"), #  date_from_fy(year, "mid") - lubridate::ymd(.data$chi_dob_1)
+      chi_age_2 = as.numeric(lubridate::interval(lubridate::ymd(.data$chi_dob_2), date_from_fy(year, "mid")), "years") #  date_from_fy(year, "mid") - lubridate::ymd(.data$chi_dob_2)
+    ) %>%
+    dplyr::rowwise() %>%
+    dplyr::mutate(
+      dob_condition_1 = .data$chi_dob_1 %in% dplyr::pick(dplyr::ends_with("_DoB")) & !(chi_dob_2 %in% dplyr::pick(dplyr::ends_with("_DoB"))),
+      dob_condition_2 = .data$chi_dob_2 %in% dplyr::pick(dplyr::ends_with("_DoB")) & !(chi_dob_1 %in% dplyr::pick(dplyr::ends_with("_DoB"))),
+      dob_condition_3 = .data$chi_dob_2 > min(lubridate::today(), date_from_fy(year, "end")),
+      dob_condition_4 = .data$chi_dob_2 > min(dplyr::pick(.data$arth_date:.data$death_date)),
+      dob_condition_5 = .data$congen_date %in% c(.data$chi_dob_1, .data$chi_dob_2)
+    ) %>%
+    dplyr::ungroup() %>%
+    dplyr::mutate(
+      DoB = dplyr::case_when(
+        .data$dob_condition_1 ~ .data$chi_dob_1,
+        .data$dob_condition_2 ~ .data$chi_dob_2
+      )
+    ) %>%
+    dplyr::mutate(
+      DoB = dplyr::case_when(
+        is.na(.data$DoB) & is.na(.data$chi_dob_1) & !is.na(.data$chi_dob_2) ~ .data$chi_dob_2,
+        is.na(.data$DoB) & is.na(.data$chi_dob_2) & !is.na(.data$chi_dob_1) ~ .data$chi_dob_1,
+        is.na(.data$DoB) & .data$chi_age_2 < 0 ~ .data$chi_dob_1,
+        is.na(.data$DoB) & .data$dob_condition_3 ~ .data$chi_dob_1,
+        is.na(.data$DoB) & .data$dob_condition_4 ~ .data$chi_dob_1,
+        is.na(.data$DoB) & .data$dob_condition_5 ~ .data$congen_date,
+        is.na(.data$DoB) & .data$chi_age_1 > 115 ~ .data$chi_dob_2
+      )
+    ) %>%
+    dplyr::select(
+      -dplyr::starts_with(c("dob_condition_", "chi_dob_", "chi_age_"))
+    )
+}
+
+clean_up_age <- function(individual_file) {
+  individual_file %>%
+    dplyr::mutate(
 
-# need to rename: OoH_cases, HL1_in_FY
+    )
+}

From 52a4ffdb50523e276352ef3ff4af9434e528074f Mon Sep 17 00:00:00 2001
From: Mandy Norrbo <mandy@jumpingrivers.com>
Date: Fri, 17 Feb 2023 10:35:03 +0000
Subject: [PATCH 015/200] Until L731

---
 R/create_individual_file.R | 152 +++++++++++++++++++++++++++++++++----
 1 file changed, 139 insertions(+), 13 deletions(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index ff84fc98f..352c26e95 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -12,7 +12,7 @@ create_individual_file <- function(episode_file) {
     add_all_columns() %>%
     find_non_duplicates(.data$ooh_case_id, "unique_ooh_case") %>%
     dplyr::mutate(unique_ooh_case = dplyr::if_else(recid != "OoH", 0, unique_ooh_case)) %>%
-    aggregate_cis_episodes() %>%
+    aggregate_ch_episodes() %>%
     clean_up_ch() %>%
     recode_gender() %>%
     aggregate_by_chi() %>%
@@ -481,7 +481,12 @@ na_type <- function(col = c("DoB", "postcode", "gpprac")) {
   return(na_type)
 }
 
-aggregate_cis_episodes <- function(episode_file) {
+#' Aggregate CIS episodes
+#'
+#' @description Aggregate CH variables by CHI and CIS.
+#'
+#' @inheritParams create_individual_file
+aggregate_ch_episodes <- function(episode_file) {
   episode_file %>%
     dplyr::group_by(.data$chi, .data$ch_chi_cis) %>%
     dplyr::mutate(
@@ -493,6 +498,10 @@ aggregate_cis_episodes <- function(episode_file) {
     dplyr::ungroup()
 }
 
+#' Clean up CH
+#'
+#' @description Clean up CH-related columns.
+#'
 #' @inheritParams create_individual_file
 clean_up_ch <- function(episode_file) {
   episode_file %>%
@@ -532,6 +541,12 @@ clean_up_ch <- function(episode_file) {
     )
 }
 
+#' Date from FY
+#'
+#' @description Return start, mid, or end date from financial year in format "2122".
+#'
+#' @param financial_year Financial year represented in "YYYY" format e.g. "2122"
+#' @param type One of "start", "end", and "mid", representing the date to return
 date_from_fy <- function(financial_year, type = c("start", "end", "mid")) {
   match.arg(type)
   n <- switch(type,
@@ -550,6 +565,11 @@ date_from_fy <- function(financial_year, type = c("start", "end", "mid")) {
   return(date)
 }
 
+#' Recode gender
+#'
+#' @description Recode gender to 1.5 if 0 or 9.
+#'
+#' @inheritParams create_individual_file
 recode_gender <- function(episode_file) {
   episode_file %>%
     dplyr::mutate(
@@ -561,6 +581,12 @@ recode_gender <- function(episode_file) {
     )
 }
 
+#' Aggregate by CHI
+#'
+#' @description Aggregate episode file by CHI to convert into
+#' individual file.
+#'
+#' @inheritParams create_individual_file
 aggregate_by_chi <- function(episode_file) {
   episode_file %>%
     dplyr::arrange(chi,
@@ -632,7 +658,11 @@ aggregate_by_chi <- function(episode_file) {
     )
 }
 
-
+#' Condition columns
+#'
+#' @description Returns chr vector of column names
+#' which follow format "condition" and "condition_date" e.g.
+#' "dementia" and "dementia_date"
 condition_cols <- function() {
   conditions <- c(
     "arth",
@@ -660,17 +690,48 @@ condition_cols <- function() {
   return(all_cols)
 }
 
+#' Custom maximum
+#'
+#' @description Custom maximum function which removes
+#' missing values but doesn't return Inf if all values
+#' are missing (instead returns NA)
+#'
+#' @param x Vector to return max of
 max_no_inf <- function(x) {
   ifelse(!all(is.na(x)), max(x, na.rm = TRUE), NA)
 }
 
+#' Custom minimum
+#'
+#' @description Custom minimum function which removes
+#' missing values but doesn't return Inf if all values
+#' are missing (instead returns NA)
+#'
+#' @param x Vector to return min of
+min_no_inf <- function(x) {
+  ifelse(!all(is.na(x)), min(x, na.rm = TRUE), NA)
+}
+
+#' Clean individual file
+#'
+#' @description Clean up columns in individual file
+#'
+#' @param individual_file Individual file where each row represents a unique CHI
 clean_individual_file <- function(individual_file) {
   individual_file %>%
     drop_cols() %>%
     clean_up_gender() %>%
-    clean_up_dob()
+    clean_up_dob() %>%
+    dplyr::mutate(
+      age = floor(as.numeric(lubridate::interval(.data$DoB, date_from_fy(year, "mid")), "years"))
+    )
 }
 
+#' Drop redundant columns
+#'
+#' @description Drop redundant columns from individual file.
+#'
+#' @inheritParams clean_individual_file
 drop_cols <- function(individual_file) {
   individual_file %>%
     dplyr::select(
@@ -684,6 +745,10 @@ drop_cols <- function(individual_file) {
     )
 }
 
+#' Month columns
+#'
+#' @description Return chr of column names following pattern
+#' "month_beddays" and "month_cost" e.g. apr_beddays" and "apr_cost"
 month_cols <- function() {
   suffix <- c("_beddays", "_cost")
   months <- tolower(c(rep(month.abb, each = 2)))
@@ -691,35 +756,45 @@ month_cols <- function() {
   return(month_cols)
 }
 
+#' Clean up gender column
+#'
+#' @description Clean up column containing gender.
+#'
+#' @inheritParams clean_individual_file
 clean_up_gender <- function(individual_file) {
   individual_file %>%
     dplyr::mutate(
       gender = dplyr::case_when(
-        gender != 1.5 ~ round(gender),
-        as.numeric(substr(chi_subset, 9, 9)) %% 2 == 1 ~ 1,
+        .data$gender != 1.5 ~ round(.data$gender),
+        as.numeric(substr(.data$chi, 9, 9)) %% 2 == 1 ~ 1,
         TRUE ~ 2
       ),
       gender = dplyr::case_when(
-        gender == 1 ~ "Male",
-        gender == 2 ~ "Female"
+        .data$gender == 1 ~ "Male",
+        .data$gender == 2 ~ "Female"
       )
     )
 }
 
+#' Clean up date of birth column
+#'
+#' @description Clean up column containing date of birth.
+#'
+#' @inheritParams clean_individual_file
 clean_up_dob <- function(individual_file) {
   individual_file %>%
     dplyr::mutate(
       chi_dob_1 = lubridate::dmy(paste0(substr(.data$chi, 1, 4), "19", substr(.data$chi, 5, 6))),
       chi_dob_2 = lubridate::dmy(paste0(substr(.data$chi, 1, 4), "20", substr(.data$chi, 5, 6))),
-      chi_age_1 = as.numeric(lubridate::interval(lubridate::ymd(.data$chi_dob_1), date_from_fy(year, "mid")), "years"), #  date_from_fy(year, "mid") - lubridate::ymd(.data$chi_dob_1)
-      chi_age_2 = as.numeric(lubridate::interval(lubridate::ymd(.data$chi_dob_2), date_from_fy(year, "mid")), "years") #  date_from_fy(year, "mid") - lubridate::ymd(.data$chi_dob_2)
+      chi_age_1 = as.numeric(lubridate::interval(lubridate::ymd(.data$chi_dob_1), date_from_fy(year, "mid")), "years"),
+      chi_age_2 = as.numeric(lubridate::interval(lubridate::ymd(.data$chi_dob_2), date_from_fy(year, "mid")), "years")
     ) %>%
     dplyr::rowwise() %>%
     dplyr::mutate(
       dob_condition_1 = .data$chi_dob_1 %in% dplyr::pick(dplyr::ends_with("_DoB")) & !(chi_dob_2 %in% dplyr::pick(dplyr::ends_with("_DoB"))),
       dob_condition_2 = .data$chi_dob_2 %in% dplyr::pick(dplyr::ends_with("_DoB")) & !(chi_dob_1 %in% dplyr::pick(dplyr::ends_with("_DoB"))),
       dob_condition_3 = .data$chi_dob_2 > min(lubridate::today(), date_from_fy(year, "end")),
-      dob_condition_4 = .data$chi_dob_2 > min(dplyr::pick(.data$arth_date:.data$death_date)),
+      dob_condition_4 = unclass(.data$chi_dob_2) > min_no_inf(as.numeric(dplyr::pick(.data$arth_date:.data$death_date))),
       dob_condition_5 = .data$congen_date %in% c(.data$chi_dob_1, .data$chi_dob_2)
     ) %>%
     dplyr::ungroup() %>%
@@ -737,17 +812,68 @@ clean_up_dob <- function(individual_file) {
         is.na(.data$DoB) & .data$dob_condition_3 ~ .data$chi_dob_1,
         is.na(.data$DoB) & .data$dob_condition_4 ~ .data$chi_dob_1,
         is.na(.data$DoB) & .data$dob_condition_5 ~ .data$congen_date,
-        is.na(.data$DoB) & .data$chi_age_1 > 115 ~ .data$chi_dob_2
+        is.na(.data$DoB) & .data$chi_age_1 > 115 ~ .data$chi_dob_2,
+        TRUE ~ .data$DoB
       )
     ) %>%
+    fill_dob() %>%
     dplyr::select(
       -dplyr::starts_with(c("dob_condition_", "chi_dob_", "chi_age_"))
     )
 }
 
-clean_up_age <- function(individual_file) {
+#' Fill missing date of births
+#'
+#' @description Fill missing date of births with
+#' date of births from specific episode columns in hierarchy.
+#'
+#' @inheritParams clean_individual_file
+fill_dob <- function(individual_file) {
+  column_prefix <- c("PIS", "AE", "OoH", "OP", "Acute", "Mat", "DN", "CMH", "MH",
+              "GLS", "HL1", "CH", "HC", "AT", "SDS", "NSU", "NRS")
+  columns <- paste0(column_prefix, "_DoB")
+  for (i in length(columns)) {
+    individual_file = replace_dob_with_col(individual_file, columns[i])
+  }
+  return(individual_file)
+}
+
+#' Fill missing date of births from one column
+#'
+#' @description Fill missing values of `DoB` with the value from a single
+#' episode date of birth column. Existing `DoB` values are left unchanged.
+#'
+#' @inheritParams clean_individual_file
+#' @param col Name (as a string) of the episode date of birth column
+replace_dob_with_col <- function(individual_file, col) {
   individual_file %>%
     dplyr::mutate(
+      DoB = dplyr::if_else(
+        is.na(.data$DoB) & !is.na(.data[[col]]),
+        .data[[col]],
+        .data$DoB
+      )
+    )
+}
 
+# WIP function to clean up postcodes L721-L805 Of D01 Make Individual File.sps
+clean_up_postcode <- function(individual_file) {
+  postcode_lookup <- readr::read_rds(get_slf_postcode_path())
+  testy2= testy %>%
+    dplyr::mutate(
+      all_blank = dplyr::if_else(
+        all(is.na(dplyr::pick(dplyr::ends_with("_postcode")))),
+        1,
+        0
+      )
+    ) %>%
+    dplyr::mutate(
+      HL1_postcode = dplyr::if_else(
+        all_blank == 1,
+        "XXX XXX",
+        .data$HL1_postcode
+      )
     )
+
 }
+

From 3223189890568f0095e209e765f0dc4a4a7a41b6 Mon Sep 17 00:00:00 2001
From: jr-mandy <jr-mandy@users.noreply.github.com>
Date: Fri, 17 Feb 2023 10:41:44 +0000
Subject: [PATCH 016/200] Update documentation

---
 man/aggregate_by_chi.Rd      | 15 +++++++++++++++
 man/aggregate_ch_episodes.Rd | 14 ++++++++++++++
 man/clean_individual_file.Rd | 14 ++++++++++++++
 man/clean_up_ch.Rd           | 14 ++++++++++++++
 man/clean_up_dob.Rd          | 14 ++++++++++++++
 man/clean_up_gender.Rd       | 14 ++++++++++++++
 man/condition_cols.Rd        | 13 +++++++++++++
 man/date_from_fy.Rd          | 16 ++++++++++++++++
 man/drop_cols.Rd             | 14 ++++++++++++++
 man/fill_dob.Rd              | 15 +++++++++++++++
 man/max_no_inf.Rd            | 16 ++++++++++++++++
 man/min_no_inf.Rd            | 16 ++++++++++++++++
 man/month_cols.Rd            | 12 ++++++++++++
 man/recode_gender.Rd         | 14 ++++++++++++++
 man/replace_dob_with_col.Rd  | 17 +++++++++++++++++
 15 files changed, 218 insertions(+)
 create mode 100644 man/aggregate_by_chi.Rd
 create mode 100644 man/aggregate_ch_episodes.Rd
 create mode 100644 man/clean_individual_file.Rd
 create mode 100644 man/clean_up_ch.Rd
 create mode 100644 man/clean_up_dob.Rd
 create mode 100644 man/clean_up_gender.Rd
 create mode 100644 man/condition_cols.Rd
 create mode 100644 man/date_from_fy.Rd
 create mode 100644 man/drop_cols.Rd
 create mode 100644 man/fill_dob.Rd
 create mode 100644 man/max_no_inf.Rd
 create mode 100644 man/min_no_inf.Rd
 create mode 100644 man/month_cols.Rd
 create mode 100644 man/recode_gender.Rd
 create mode 100644 man/replace_dob_with_col.Rd

diff --git a/man/aggregate_by_chi.Rd b/man/aggregate_by_chi.Rd
new file mode 100644
index 000000000..73804ad9b
--- /dev/null
+++ b/man/aggregate_by_chi.Rd
@@ -0,0 +1,15 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{aggregate_by_chi}
+\alias{aggregate_by_chi}
+\title{Aggregate by CHI}
+\usage{
+aggregate_by_chi(episode_file)
+}
+\arguments{
+\item{episode_file}{Tibble containing episodic data}
+}
+\description{
+Aggregate episode file by CHI to convert into
+individual file.
+}
diff --git a/man/aggregate_ch_episodes.Rd b/man/aggregate_ch_episodes.Rd
new file mode 100644
index 000000000..2753da14f
--- /dev/null
+++ b/man/aggregate_ch_episodes.Rd
@@ -0,0 +1,14 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{aggregate_ch_episodes}
+\alias{aggregate_ch_episodes}
+\title{Aggregate CIS episodes}
+\usage{
+aggregate_ch_episodes(episode_file)
+}
+\arguments{
+\item{episode_file}{Tibble containing episodic data}
+}
+\description{
+Aggregate CH variables by CHI and CIS.
+}
diff --git a/man/clean_individual_file.Rd b/man/clean_individual_file.Rd
new file mode 100644
index 000000000..30d5479c6
--- /dev/null
+++ b/man/clean_individual_file.Rd
@@ -0,0 +1,14 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{clean_individual_file}
+\alias{clean_individual_file}
+\title{Clean individual file}
+\usage{
+clean_individual_file(individual_file)
+}
+\arguments{
+\item{individual_file}{Individual file where each row represents a unique CHI}
+}
+\description{
+Clean up columns in individual file
+}
diff --git a/man/clean_up_ch.Rd b/man/clean_up_ch.Rd
new file mode 100644
index 000000000..64bb3e330
--- /dev/null
+++ b/man/clean_up_ch.Rd
@@ -0,0 +1,14 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{clean_up_ch}
+\alias{clean_up_ch}
+\title{Clean up CH}
+\usage{
+clean_up_ch(episode_file)
+}
+\arguments{
+\item{episode_file}{Tibble containing episodic data}
+}
+\description{
+Clean up CH-related columns.
+}
diff --git a/man/clean_up_dob.Rd b/man/clean_up_dob.Rd
new file mode 100644
index 000000000..4b9003726
--- /dev/null
+++ b/man/clean_up_dob.Rd
@@ -0,0 +1,14 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{clean_up_dob}
+\alias{clean_up_dob}
+\title{Clean up date of birth column}
+\usage{
+clean_up_dob(individual_file)
+}
+\arguments{
+\item{individual_file}{Individual file where each row represents a unique CHI}
+}
+\description{
+Clean up column containing date of birth.
+}
diff --git a/man/clean_up_gender.Rd b/man/clean_up_gender.Rd
new file mode 100644
index 000000000..edf05bfc8
--- /dev/null
+++ b/man/clean_up_gender.Rd
@@ -0,0 +1,14 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{clean_up_gender}
+\alias{clean_up_gender}
+\title{Clean up gender column}
+\usage{
+clean_up_gender(individual_file)
+}
+\arguments{
+\item{individual_file}{Individual file where each row represents a unique CHI}
+}
+\description{
+Clean up column containing gender.
+}
diff --git a/man/condition_cols.Rd b/man/condition_cols.Rd
new file mode 100644
index 000000000..ba037a609
--- /dev/null
+++ b/man/condition_cols.Rd
@@ -0,0 +1,13 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{condition_cols}
+\alias{condition_cols}
+\title{Condition columns}
+\usage{
+condition_cols()
+}
+\description{
+Returns chr vector of column names
+which follow format "condition" and "condition_date" e.g.
+"dementia" and "dementia_date"
+}
diff --git a/man/date_from_fy.Rd b/man/date_from_fy.Rd
new file mode 100644
index 000000000..cc3b8f8a3
--- /dev/null
+++ b/man/date_from_fy.Rd
@@ -0,0 +1,16 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{date_from_fy}
+\alias{date_from_fy}
+\title{Date from FY}
+\usage{
+date_from_fy(financial_year, type = c("start", "end", "mid"))
+}
+\arguments{
+\item{financial_year}{Financial year represented in "YYYY" format e.g. "2122"}
+
+\item{type}{One of "start", "end", and "mid", representing the date to return}
+}
+\description{
+Return start, mid, or end date from financial year in format "2122".
+}
diff --git a/man/drop_cols.Rd b/man/drop_cols.Rd
new file mode 100644
index 000000000..8029d289c
--- /dev/null
+++ b/man/drop_cols.Rd
@@ -0,0 +1,14 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{drop_cols}
+\alias{drop_cols}
+\title{Drop redundant columns}
+\usage{
+drop_cols(individual_file)
+}
+\arguments{
+\item{individual_file}{Individual file where each row represents a unique CHI}
+}
+\description{
+Drop redundant columns from individual file.
+}
diff --git a/man/fill_dob.Rd b/man/fill_dob.Rd
new file mode 100644
index 000000000..99d3c03bd
--- /dev/null
+++ b/man/fill_dob.Rd
@@ -0,0 +1,15 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{fill_dob}
+\alias{fill_dob}
+\title{Fill missing date of births}
+\usage{
+fill_dob(individual_file)
+}
+\arguments{
+\item{individual_file}{Individual file where each row represents a unique CHI}
+}
+\description{
+Fill missing date of births with
+date of births from specific episode columns in hierarchy.
+}
diff --git a/man/max_no_inf.Rd b/man/max_no_inf.Rd
new file mode 100644
index 000000000..79b9a1057
--- /dev/null
+++ b/man/max_no_inf.Rd
@@ -0,0 +1,16 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{max_no_inf}
+\alias{max_no_inf}
+\title{Custom maximum}
+\usage{
+max_no_inf(x)
+}
+\arguments{
+\item{x}{Vector to return max of}
+}
+\description{
+Custom maximum function which removes
+missing values but doesn't return Inf if all values
+are missing (instead returns NA)
+}
diff --git a/man/min_no_inf.Rd b/man/min_no_inf.Rd
new file mode 100644
index 000000000..38029214f
--- /dev/null
+++ b/man/min_no_inf.Rd
@@ -0,0 +1,16 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{min_no_inf}
+\alias{min_no_inf}
+\title{Custom minimum}
+\usage{
+min_no_inf(x)
+}
+\arguments{
+\item{x}{Vector to return min of}
+}
+\description{
+Custom minimum function which removes
+missing values but doesn't return Inf if all values
+are missing (instead returns NA)
+}
diff --git a/man/month_cols.Rd b/man/month_cols.Rd
new file mode 100644
index 000000000..b8dd641e5
--- /dev/null
+++ b/man/month_cols.Rd
@@ -0,0 +1,12 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{month_cols}
+\alias{month_cols}
+\title{Month columns}
+\usage{
+month_cols()
+}
+\description{
+Return chr of column names following pattern
+"month_beddays" and "month_cost" e.g. apr_beddays" and "apr_cost"
+}
diff --git a/man/recode_gender.Rd b/man/recode_gender.Rd
new file mode 100644
index 000000000..526d2829d
--- /dev/null
+++ b/man/recode_gender.Rd
@@ -0,0 +1,14 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{recode_gender}
+\alias{recode_gender}
+\title{Recode gender}
+\usage{
+recode_gender(episode_file)
+}
+\arguments{
+\item{episode_file}{Tibble containing episodic data}
+}
+\description{
+Recode gender to 1.5 if 0 or 9.
+}
diff --git a/man/replace_dob_with_col.Rd b/man/replace_dob_with_col.Rd
new file mode 100644
index 000000000..61016ec2e
--- /dev/null
+++ b/man/replace_dob_with_col.Rd
@@ -0,0 +1,17 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{replace_dob_with_col}
+\alias{replace_dob_with_col}
+\title{Fill missing date of births}
+\usage{
+replace_dob_with_col(individual_file, col)
+}
+\arguments{
+\item{individual_file}{Individual file where each row represents a unique CHI}
+
+\item{col}{Column containing date of birth for episode}
+}
+\description{
+Fill missing date of births with
+date of births from an episode date of birth column.
+}

From 1c67f20269cf1459bca51614df0706c9347e55c1 Mon Sep 17 00:00:00 2001
From: Mandy Norrbo <mandy@jumpingrivers.com>
Date: Fri, 17 Feb 2023 10:43:29 +0000
Subject: [PATCH 017/200] Remove test ref

---
 R/create_individual_file.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index 352c26e95..5f6b8acfe 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -859,7 +859,7 @@ replace_dob_with_col <- function(individual_file, col) {
 # WIP function to clean up postcodes L721-L805 Of D01 Make Individual File.sps
 clean_up_postcode <- function(individual_file) {
   postcode_lookup <- readr::read_rds(get_slf_postcode_path())
-  testy2= testy %>%
+  individual_file %>%
     dplyr::mutate(
       all_blank = dplyr::if_else(
         all(is.na(dplyr::pick(dplyr::ends_with("_postcode")))),

From b14e4b2175639b09e66325dff00a71d551b2523c Mon Sep 17 00:00:00 2001
From: jr-mandy <jr-mandy@users.noreply.github.com>
Date: Fri, 17 Feb 2023 10:48:52 +0000
Subject: [PATCH 018/200] Style code

---
 R/create_individual_file.R | 51 ++++++++++++++++++++++----------------
 1 file changed, 29 insertions(+), 22 deletions(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index 5f6b8acfe..0d598a38e 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -306,7 +306,8 @@ add_ch_columns <- function(episode_file, prefix, condition) {
       ch_cost_per_day = dplyr::if_else(eval(condition) & .data$yearstay > 0, .data$cost_total_net / .data$yearstay, NA_real_),
       ch_cost_per_day = dplyr::if_else(eval(condition) & .data$yearstay == 0, .data$cost_total_net / .data$yearstay, .data$ch_cost_per_day),
       ch_no_cost = eval(condition) & is.na(ch_cost_per_day),
-      ch_ep_end = dplyr::if_else(eval(condition), .data$keydate2_dateformat, lubridate::NA_Date_)) %>%
+      ch_ep_end = dplyr::if_else(eval(condition), .data$keydate2_dateformat, lubridate::NA_Date_)
+    ) %>%
     dplyr::rowwise() %>%
     dplyr::mutate(
       ch_ep_end = dplyr::if_else(eval(condition) & is.na(ch_ep_end), lubridate::quarter(zoo::as.yearqtr(.data$sc_latest_submission), type = "date_first", fiscal_start = 4), .data$ch_ep_end)
@@ -506,12 +507,13 @@ aggregate_ch_episodes <- function(episode_file) {
 clean_up_ch <- function(episode_file) {
   episode_file %>%
     dplyr::mutate(
-        fy_end = date_from_fy(year, "end") + 1,
-        fy_start = date_from_fy(year, "start")) %>%
+      fy_end = date_from_fy(year, "end") + 1,
+      fy_start = date_from_fy(year, "start")
+    ) %>%
     dplyr::rowwise() %>%
     dplyr::mutate(
-        term_1 = min(ch_ep_end, fy_end + 1),
-        term_2 = max(ch_ep_start, fy_start)
+      term_1 = min(ch_ep_end, fy_end + 1),
+      term_2 = max(ch_ep_start, fy_start)
     ) %>%
     dplyr::ungroup() %>%
     dplyr::mutate(
@@ -550,10 +552,11 @@ clean_up_ch <- function(episode_file) {
 date_from_fy <- function(financial_year, type = c("start", "end", "mid")) {
   match.arg(type)
   n <- switch(type,
-              "start" = 0,
-              "mid" = 0,
-              "end" = 2)
-  year = as.numeric(paste0("20", substr(financial_year, 1 + n, 2 + n)))
+    "start" = 0,
+    "mid" = 0,
+    "end" = 2
+  )
+  year <- as.numeric(paste0("20", substr(financial_year, 1 + n, 2 + n)))
   if (type == "start") {
     date <- lubridate::make_date(year, 4, 1)
     return(date)
@@ -589,11 +592,13 @@ recode_gender <- function(episode_file) {
 #' @inheritParams create_individual_file
 aggregate_by_chi <- function(episode_file) {
   episode_file %>%
-    dplyr::arrange(chi,
-                   keydate1_dateformat,
-                   keytime1,
-                   keydate2_dateformat,
-                   keytime2) %>%
+    dplyr::arrange(
+      chi,
+      keydate1_dateformat,
+      keytime1,
+      keydate2_dateformat,
+      keytime2
+    ) %>%
     dplyr::group_by(chi) %>%
     dplyr::summarise(
       gender = mean(gender),
@@ -646,13 +651,15 @@ aggregate_by_chi <- function(episode_file) {
         ~ max_no_inf(.)
       ),
       dplyr::across(
-        c(condition_cols(),
+        c(
+          condition_cols(),
           "death_date",
           "deceased",
           "year",
           dplyr::ends_with(c(
             "_Cohort", "end_fy", "start_fy"
-          )),),
+          )),
+        ),
         ~ dplyr::first(., na_rm = TRUE)
       )
     )
@@ -680,7 +687,7 @@ condition_cols <- function() {
     "ms",
     "parkinsons",
     "refailure",
-    "congen" ,
+    "congen",
     "bloodbfo",
     "endomet",
     "digestive"
@@ -829,11 +836,13 @@ clean_up_dob <- function(individual_file) {
 #'
 #' @inheritParams clean_individual_file
 fill_dob <- function(individual_file) {
-  column_prefix <- c("PIS", "AE", "OoH", "OP", "Acute", "Mat", "DN", "CMH", "MH",
-              "GLS", "HL1", "CH", "HC", "AT", "SDS", "NSU", "NRS")
+  column_prefix <- c(
+    "PIS", "AE", "OoH", "OP", "Acute", "Mat", "DN", "CMH", "MH",
+    "GLS", "HL1", "CH", "HC", "AT", "SDS", "NSU", "NRS"
+  )
   columns <- paste0(column_prefix, "_DoB")
   for (i in length(columns)) {
-    individual_file = replace_dob_with_col(individual_file, columns[i])
+    individual_file <- replace_dob_with_col(individual_file, columns[i])
   }
   return(individual_file)
 }
@@ -874,6 +883,4 @@ clean_up_postcode <- function(individual_file) {
         .data$HL1_postcode
       )
     )
-
 }
-

From f31f19bff6584def710d931f72ec7d2781e69f78 Mon Sep 17 00:00:00 2001
From: "shintoLampgit config --global user.email bateman.mcbride@phs.scotm  git
 config --global user.name shintoLamp" <bateman.mcbride@phs.scot>
Date: Thu, 2 Mar 2023 17:17:56 +0000
Subject: [PATCH 019/200] WIP writing functions to fill postcode in line with
 previous DOB functions

---
 R/create_individual_file.R | 56 ++++++++++++++++++++++++++++++++------
 1 file changed, 47 insertions(+), 9 deletions(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index 0d598a38e..f2f32ee5d 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -731,7 +731,8 @@ clean_individual_file <- function(individual_file) {
     clean_up_dob() %>%
     dplyr::mutate(
       age = floor(as.numeric(lubridate::interval(.data$DoB, date_from_fy(year, "mid")), "years"))
-    )
+    ) %>%
+    clean_up_postcode()
 }
 
 #' Drop redundant columns
@@ -865,18 +866,19 @@ replace_dob_with_col <- function(individual_file, col) {
     )
 }
 
-# WIP function to clean up postcodes L721-L805 Of D01 Make Individual File.sps
+#' Clean up postcode column
+#'
+#' @description Clean up column containing postcode.
+#'
+#' @inheritParams clean_individual_file
 clean_up_postcode <- function(individual_file) {
   postcode_lookup <- readr::read_rds(get_slf_postcode_path())
   individual_file %>%
     dplyr::mutate(
-      all_blank = dplyr::if_else(
-        all(is.na(dplyr::pick(dplyr::ends_with("_postcode")))),
-        1,
-        0
-      )
-    ) %>%
-    dplyr::mutate(
+      # all_blank is TRUE when all postcode variables are blank
+      all_blank = all(is.na(dplyr::pick(dplyr::ends_with("_postcode")))),
+      # Use HL1_postcode to store the dummy for no other reason than it's last
+      # in the hierarchy
       HL1_postcode = dplyr::if_else(
         all_blank == 1,
         "XXX XXX",
@@ -884,3 +886,39 @@ clean_up_postcode <- function(individual_file) {
       )
     )
 }
+
+#' Fill missing postcodes
+#'
+#' @description Fill missing postcodes with the first non-missing postcode
+#' from the episode postcode columns, checked in hierarchy order.
+#'
+#' @inheritParams clean_individual_file
+fill_postcode <- function(individual_file) {
+  column_prefix <- c(
+    "PIS", "AE", "OoH", "OP", "Acute", "Mat", "HC", "DN", "CMH", "MH",
+    "GLS", "AT", "SDS", "CH", "NSU", "NRS", "HL1"
+  )
+  columns <- paste0(column_prefix, "_postcode")
+  for (i in seq_along(columns)) { # every column, in hierarchy order
+    individual_file <- replace_postcode_with_col(individual_file, columns[i])
+  }
+  return(individual_file)
+}
+
+#' Fill missing postcode from one column
+#'
+#' @description Fill missing values of `postcode` with the value from a
+#' single episode postcode column. Existing postcodes are left unchanged.
+#'
+#' @inheritParams clean_individual_file
+#' @param col Name (as a string) of the episode postcode column
+replace_postcode_with_col <- function(individual_file, col) {
+  individual_file %>%
+    dplyr::mutate(
+      postcode = dplyr::if_else(
+        is.na(.data$postcode) & !is.na(.data[[col]]),
+        .data[[col]],
+        .data$postcode
+      )
+    )
+}

From 891c9a9fedf76a1ef4074cecb7961a1148dd78c9 Mon Sep 17 00:00:00 2001
From: shintoLamp <shintoLamp@users.noreply.github.com>
Date: Thu, 2 Mar 2023 17:21:35 +0000
Subject: [PATCH 020/200] Update documentation

---
 man/clean_up_postcode.Rd         | 14 ++++++++++++++
 man/fill_dob.Rd                  |  5 +++++
 man/replace_postcode_with_col.Rd | 17 +++++++++++++++++
 3 files changed, 36 insertions(+)
 create mode 100644 man/clean_up_postcode.Rd
 create mode 100644 man/replace_postcode_with_col.Rd

diff --git a/man/clean_up_postcode.Rd b/man/clean_up_postcode.Rd
new file mode 100644
index 000000000..b3cd91548
--- /dev/null
+++ b/man/clean_up_postcode.Rd
@@ -0,0 +1,14 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{clean_up_postcode}
+\alias{clean_up_postcode}
+\title{Clean up postcode column}
+\usage{
+clean_up_postcode(individual_file)
+}
+\arguments{
+\item{individual_file}{Individual file where each row represents a unique CHI}
+}
+\description{
+Clean up column containing postcode.
+}
diff --git a/man/fill_dob.Rd b/man/fill_dob.Rd
index 99d3c03bd..3dc8e4295 100644
--- a/man/fill_dob.Rd
+++ b/man/fill_dob.Rd
@@ -4,6 +4,8 @@
 \alias{fill_dob}
 \title{Fill missing date of births}
 \usage{
+fill_dob(individual_file)
+
 fill_dob(individual_file)
 }
 \arguments{
@@ -12,4 +14,7 @@ fill_dob(individual_file)
 \description{
 Fill missing date of births with
 date of births from specific episode columns in hierarchy.
+
+Fill missing postcodes with
+postcodes from specific episode columns in hierarchy.
 }
diff --git a/man/replace_postcode_with_col.Rd b/man/replace_postcode_with_col.Rd
new file mode 100644
index 000000000..3feb0fbcb
--- /dev/null
+++ b/man/replace_postcode_with_col.Rd
@@ -0,0 +1,17 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{replace_postcode_with_col}
+\alias{replace_postcode_with_col}
+\title{Fill missing postcode}
+\usage{
+replace_postcode_with_col(individual_file, col)
+}
+\arguments{
+\item{individual_file}{Individual file where each row represents a unique CHI}
+
+\item{col}{Column containing postcode for episode}
+}
+\description{
+Fill missing postcode with
+postcodes from an episode postcode column.
+}

From 73f77d5a965d5c81eabade4115be415a6b8c3747 Mon Sep 17 00:00:00 2001
From: Jennifer Thom <jennifer.thom@phs.scot>
Date: Wed, 3 May 2023 13:35:06 +0100
Subject: [PATCH 021/200] implement quick fix for running 22/23

---
 R/add_nsu_cohort.R | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/R/add_nsu_cohort.R b/R/add_nsu_cohort.R
index ff27b0afa..1f4d4b334 100644
--- a/R/add_nsu_cohort.R
+++ b/R/add_nsu_cohort.R
@@ -11,6 +11,10 @@
 add_nsu_cohort <- function(data, year) {
   year_param <- year
 
+  if (year == "2223"){
+    return(data)
+  }else{
+
   # Check that the variables we need are in the data
   check_variables_exist(data,
     variables = c(
@@ -110,4 +114,5 @@ add_nsu_cohort <- function(data, year) {
     dplyr::select(-dplyr::contains("_nsu"), -"has_chi")
 
   return(return_df)
+  }
 }

From 00f37c807a266b2d09ed6f4f461b44ee303279be Mon Sep 17 00:00:00 2001
From: Jennit07 <Jennit07@users.noreply.github.com>
Date: Wed, 3 May 2023 12:42:33 +0000
Subject: [PATCH 022/200] Style code

---
 R/add_nsu_cohort.R | 195 ++++++++++++++++++++++-----------------------
 1 file changed, 97 insertions(+), 98 deletions(-)

diff --git a/R/add_nsu_cohort.R b/R/add_nsu_cohort.R
index 1f4d4b334..d98da5ee1 100644
--- a/R/add_nsu_cohort.R
+++ b/R/add_nsu_cohort.R
@@ -11,108 +11,107 @@
 add_nsu_cohort <- function(data, year) {
   year_param <- year
 
-  if (year == "2223"){
+  if (year == "2223") {
     return(data)
-  }else{
-
-  # Check that the variables we need are in the data
-  check_variables_exist(data,
-    variables = c(
-      "year",
-      "chi",
-      "recid",
-      "smrtype",
-      "postcode",
-      "gpprac",
-      "dob",
-      "gender"
+  } else {
+    # Check that the variables we need are in the data
+    check_variables_exist(data,
+      variables = c(
+        "year",
+        "chi",
+        "recid",
+        "smrtype",
+        "postcode",
+        "gpprac",
+        "dob",
+        "gender"
+      )
     )
-  )
 
-  matched <- dplyr::full_join(data,
-    # NSU cohort file
-    read_file(get_nsu_path(year)) %>%
-      dplyr::mutate(
-        dob = as.Date(.data[["dob"]]),
-        gpprac = convert_eng_gpprac_to_dummy(.data[["gpprac"]])
-      ),
-    # Match on by chi
-    by = "chi",
-    # Name the incoming variables with "_nsu"
-    suffix = c("", "_nsu"),
-    # Keep the chi from both sources
-    keep = TRUE
-  ) %>%
-    # Change the chi from the NSU cohort to a boolean
-    dplyr::mutate(has_chi = !is_missing(.data[["chi_nsu"]]))
-
-  return_df <- matched %>%
-    # Get data from non service user lookup if the recid is empty
-    dplyr::mutate(
-      year = year_param,
-      recid = dplyr::if_else(
-        is_missing(.data[["recid"]]),
-        "NSU",
-        .data[["recid"]]
-      ),
-      smrtype = dplyr::if_else(
-        is_missing(.data[["recid"]]),
-        "Non-User",
-        .data[["smrtype"]]
-      ),
-      postcode = dplyr::if_else(
-        is_missing(.data[["recid"]]),
-        .data[["postcode_nsu"]],
-        .data[["postcode"]]
-      ),
-      gpprac = dplyr::if_else(
-        is_missing(.data[["recid"]]),
-        .data[["gpprac_nsu"]],
-        .data[["gpprac"]]
-      ),
-      dob = dplyr::if_else(
-        is_missing(.data[["recid"]]),
-        .data[["dob_nsu"]],
-        .data[["dob"]]
-      ),
-      gender = dplyr::if_else(
-        is_missing(.data[["recid"]]),
-        .data[["gender_nsu"]],
-        .data[["gender"]]
-      )
+    matched <- dplyr::full_join(data,
+      # NSU cohort file
+      read_file(get_nsu_path(year)) %>%
+        dplyr::mutate(
+          dob = as.Date(.data[["dob"]]),
+          gpprac = convert_eng_gpprac_to_dummy(.data[["gpprac"]])
+        ),
+      # Match on by chi
+      by = "chi",
+      # Name the incoming variables with "_nsu"
+      suffix = c("", "_nsu"),
+      # Keep the chi from both sources
+      keep = TRUE
     ) %>%
-    # If the data has come from the NSU cohort,
-    # use that data for the below variables
-    dplyr::mutate(
-      postcode = dplyr::if_else(
-        is_missing(.data[["postcode"]]) & .data[["has_chi"]],
-        .data[["postcode_nsu"]],
-        .data[["postcode"]]
-      ),
-      gpprac = dplyr::if_else(
-        is.na(.data[["gpprac"]]) & .data[["has_chi"]],
-        .data[["gpprac_nsu"]],
-        .data[["gpprac"]]
-      ),
-      dob = dplyr::if_else(
-        is.na(.data[["dob"]]) & .data[["has_chi"]],
-        .data[["dob_nsu"]],
-        .data[["dob"]]
-      ),
-      gender = dplyr::if_else(
-        is.na(.data[["gender"]]) & .data[["has_chi"]],
-        .data[["gender_nsu"]],
-        .data[["gender"]]
-      ),
-      chi = dplyr::if_else(
-        is_missing(.data[["chi"]]) & .data[["has_chi"]],
-        .data[["chi_nsu"]],
-        .data[["chi"]]
-      )
-    ) %>%
-    # Remove the additional columns
-    dplyr::select(-dplyr::contains("_nsu"), -"has_chi")
+      # Change the chi from the NSU cohort to a boolean
+      dplyr::mutate(has_chi = !is_missing(.data[["chi_nsu"]]))
+
+    return_df <- matched %>%
+      # Get data from non service user lookup if the recid is empty
+      dplyr::mutate(
+        year = year_param,
+        recid = dplyr::if_else(
+          is_missing(.data[["recid"]]),
+          "NSU",
+          .data[["recid"]]
+        ),
+        smrtype = dplyr::if_else(
+          is_missing(.data[["recid"]]),
+          "Non-User",
+          .data[["smrtype"]]
+        ),
+        postcode = dplyr::if_else(
+          is_missing(.data[["recid"]]),
+          .data[["postcode_nsu"]],
+          .data[["postcode"]]
+        ),
+        gpprac = dplyr::if_else(
+          is_missing(.data[["recid"]]),
+          .data[["gpprac_nsu"]],
+          .data[["gpprac"]]
+        ),
+        dob = dplyr::if_else(
+          is_missing(.data[["recid"]]),
+          .data[["dob_nsu"]],
+          .data[["dob"]]
+        ),
+        gender = dplyr::if_else(
+          is_missing(.data[["recid"]]),
+          .data[["gender_nsu"]],
+          .data[["gender"]]
+        )
+      ) %>%
+      # If the data has come from the NSU cohort,
+      # use that data for the below variables
+      dplyr::mutate(
+        postcode = dplyr::if_else(
+          is_missing(.data[["postcode"]]) & .data[["has_chi"]],
+          .data[["postcode_nsu"]],
+          .data[["postcode"]]
+        ),
+        gpprac = dplyr::if_else(
+          is.na(.data[["gpprac"]]) & .data[["has_chi"]],
+          .data[["gpprac_nsu"]],
+          .data[["gpprac"]]
+        ),
+        dob = dplyr::if_else(
+          is.na(.data[["dob"]]) & .data[["has_chi"]],
+          .data[["dob_nsu"]],
+          .data[["dob"]]
+        ),
+        gender = dplyr::if_else(
+          is.na(.data[["gender"]]) & .data[["has_chi"]],
+          .data[["gender_nsu"]],
+          .data[["gender"]]
+        ),
+        chi = dplyr::if_else(
+          is_missing(.data[["chi"]]) & .data[["has_chi"]],
+          .data[["chi_nsu"]],
+          .data[["chi"]]
+        )
+      ) %>%
+      # Remove the additional columns
+      dplyr::select(-dplyr::contains("_nsu"), -"has_chi")
 
-  return(return_df)
+    return(return_df)
   }
 }

From 2cacee859549e031fde19c92dc92d4ad38222082 Mon Sep 17 00:00:00 2001
From: James McMahon <james.mcmahon@phs.scot>
Date: Wed, 3 May 2023 14:46:21 +0100
Subject: [PATCH 023/200] Fix missed comma

---
 DESCRIPTION | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index de98cd97e..0d2df7dc2 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -52,7 +52,7 @@ Imports:
     stringr (>= 1.5.0),
     tibble (>= 3.2.1),
     tidyr (>= 1.3.0),
-    tidyselect (>= 1.2.0)
+    tidyselect (>= 1.2.0),
     zoo (>= 1.8.0)
 Suggests:
     covr (>= 3.6.1),

From 369a8bbfa1bf6418d8aee3e36ebee5c7541c0d6a Mon Sep 17 00:00:00 2001
From: Jennifer Thom <jennifer.thom@phs.scot>
Date: Fri, 5 May 2023 11:05:18 +0100
Subject: [PATCH 024/200] Exclude DD code for now - TEMP fix

---
 R/create_individual_file.R | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index f2f32ee5d..1e0a28457 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -67,11 +67,11 @@ add_cij_columns <- function(episode_file) {
         NA_real_
       )
     ) %>%
-    dplyr::mutate(cij_delay = dplyr::if_else(
-      (.data$cij_delay == 1 & .data$Distinct_CIJ == 1),
-      1,
-      0
-    )) %>%
+    # dplyr::mutate(cij_delay = dplyr::if_else(
+    #   (.data$cij_delay == 1 & .data$Distinct_CIJ == 1),
+    #   1,
+    #   0
+    # )) %>%
     dplyr::mutate(
       preventable_admissions = dplyr::if_else(
         (.data$cij_ppa == 1 & .data$Distinct_CIJ == 1),
@@ -104,7 +104,7 @@ add_all_columns <- function(episode_file) {
     add_ooh_columns("OoH", .data$recid == "OoH") %>%
     add_dn_columns("DN", .data$recid == "DN") %>%
     add_cmh_columns("CMH", .data$recid == "CMH") %>%
-    add_dd_columns("DD", .data$recid == "DD") %>%
+    #add_dd_columns("DD", .data$recid == "DD") %>%
     add_nsu_columns("NSU", .data$recid == "NSU") %>%
     add_nrs_columns("NRS", .data$recid == "NRS") %>%
     add_hl1_columns("HL1", .data$recid == "HL1") %>%

From f7158305181ba9e228d38bf827614817c6565cc2 Mon Sep 17 00:00:00 2001
From: Jennifer Thom <jennifer.thom@phs.scot>
Date: Fri, 5 May 2023 11:05:56 +0100
Subject: [PATCH 025/200] Correct/rename variables

---
 R/create_individual_file.R | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index 1e0a28457..5f32dc39f 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -173,7 +173,7 @@ add_op_columns <- function(episode_file, prefix, condition) {
   episode_file <- episode_file %>%
     dplyr::mutate(
       "{prefix}_newcons_dnas" := dplyr::if_else(eval(condition_5_8), 1, NA_real_),
-      "{prefix}_cost_dnas" := dplyr::if_else(eval(condition_5_8), .data$cost_total_net_incdnas, NA_real_)
+      "{prefix}_cost_dnas" := dplyr::if_else(eval(condition_5_8), .data$cost_total_net_inc_dnas, NA_real_)
     )
   return(episode_file)
 }
@@ -219,7 +219,7 @@ add_ooh_columns <- function(episode_file, prefix, condition) {
 
   episode_file <- episode_file %>%
     dplyr::mutate(
-      OoH_consultation_time = dplyr::if_else(eval(condition), as.numeric((lubridate::seconds_to_period(.data$keytime2) + .data$keydate2_dateformat) - (lubridate::seconds_to_period(.data$keytime1) + .data$keydate1_dateformat), units = "mins"), NA_real_),
+      OoH_consultation_time = dplyr::if_else(eval(condition), as.numeric((lubridate::seconds_to_period(.data$keytime2) + .data$record_keydate2) - (lubridate::seconds_to_period(.data$keytime1) + .data$record_keydate1), units = "mins"), NA_real_),
       OoH_consultation_time = dplyr::if_else(OoH_consultation_time < 0, 0, .data$OoH_consultation_time)
     )
   return(episode_file)
@@ -232,7 +232,7 @@ add_dn_columns <- function(episode_file, prefix, condition) {
   condition <- substitute(condition)
   episode_file %>%
     add_standard_cols(prefix, condition, episode = TRUE, cost = TRUE) %>%
-    dplyr::mutate("{prefix}_contacts" := dplyr::if_else(eval(condition), .data$totalnodncontacts, NA_real_))
+    dplyr::mutate("{prefix}_contacts" := dplyr::if_else(eval(condition), .data$total_no_dn_contacts, NA_real_))
 }
 
 #' Add CMH columns
@@ -306,7 +306,7 @@ add_ch_columns <- function(episode_file, prefix, condition) {
       ch_cost_per_day = dplyr::if_else(eval(condition) & .data$yearstay > 0, .data$cost_total_net / .data$yearstay, NA_real_),
       ch_cost_per_day = dplyr::if_else(eval(condition) & .data$yearstay == 0, .data$cost_total_net / .data$yearstay, .data$ch_cost_per_day),
       ch_no_cost = eval(condition) & is.na(ch_cost_per_day),
-      ch_ep_end = dplyr::if_else(eval(condition), .data$keydate2_dateformat, lubridate::NA_Date_)
+      ch_ep_end = dplyr::if_else(eval(condition), .data$record_keydate2, lubridate::NA_Date_)
     ) %>%
     dplyr::rowwise() %>%
     dplyr::mutate(
@@ -492,7 +492,7 @@ aggregate_ch_episodes <- function(episode_file) {
     dplyr::group_by(.data$chi, .data$ch_chi_cis) %>%
     dplyr::mutate(
       ch_no_cost = max(.data$ch_no_cost),
-      ch_ep_start = min(.data$keydate1_dateformat),
+      ch_ep_start = min(.data$record_keydate1),
       ch_ep_end = max(.data$ch_ep_end),
       ch_cost_per_day = mean(.data$ch_cost_per_day)
     ) %>%
@@ -594,9 +594,9 @@ aggregate_by_chi <- function(episode_file) {
   episode_file %>%
     dplyr::arrange(
       chi,
-      keydate1_dateformat,
+      record_keydate1,
       keytime1,
-      keydate2_dateformat,
+      record_keydate2,
       keytime2
     ) %>%
     dplyr::group_by(chi) %>%
@@ -749,7 +749,7 @@ drop_cols <- function(individual_file) {
       -"postcode",
       -"gpprac",
       -"no_paid_items",
-      -"totalnodncontacts"
+      -"total_no_dn_contacts"
     )
 }
 

From d537aadb1ed8da72edafc8f1f3abfcdd40f40aaf Mon Sep 17 00:00:00 2001
From: Jennit07 <Jennit07@users.noreply.github.com>
Date: Fri, 5 May 2023 10:09:57 +0000
Subject: [PATCH 026/200] Style code

---
 R/create_individual_file.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index 5f32dc39f..be1c0ab81 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -104,7 +104,7 @@ add_all_columns <- function(episode_file) {
     add_ooh_columns("OoH", .data$recid == "OoH") %>%
     add_dn_columns("DN", .data$recid == "DN") %>%
     add_cmh_columns("CMH", .data$recid == "CMH") %>%
-    #add_dd_columns("DD", .data$recid == "DD") %>%
+    # add_dd_columns("DD", .data$recid == "DD") %>%
     add_nsu_columns("NSU", .data$recid == "NSU") %>%
     add_nrs_columns("NRS", .data$recid == "NRS") %>%
     add_hl1_columns("HL1", .data$recid == "HL1") %>%

From 50641b3a3c29b5e80338f8de2543e6837922119e Mon Sep 17 00:00:00 2001
From: Jennifer Thom <jennifer.thom@phs.scot>
Date: Fri, 12 May 2023 12:50:49 +0100
Subject: [PATCH 027/200] Include NSU in `check_year_valid`

---
 R/check_year_valid.R | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/R/check_year_valid.R b/R/check_year_valid.R
index 3cdab4598..fcb93984b 100644
--- a/R/check_year_valid.R
+++ b/R/check_year_valid.R
@@ -21,16 +21,21 @@ check_year_valid <- function(year, type = c(
                                "Homelessness",
                                "Maternity",
                                "MH",
+                               "NSU",
                                "Outpatients",
                                "PIS",
                                "SDS"
                              )) {
+  if (year >= "2223" & type == "NSU"){
+    return(FALSE)
+  }
+
   if (year <= "1415") {
-    if (type %in% c("CMH", "DN", "Homelessness")) {
+    if (type %in% c("CMH", "DN", "Homelessness", "CH", "HC", "SDS", "AT")) {
       return(FALSE)
     }
   } else if (year <= "1516") {
-    if (type %in% c("CMH", "Homelessness")) {
+    if (type %in% c("CMH", "Homelessness", "CH", "HC", "SDS", "AT")) {
       return(FALSE)
     }
   } else if (year <= "1617") {

From 1b52ebbc760390c5823ff25f9686f50965c97c3d Mon Sep 17 00:00:00 2001
From: Jennifer Thom <jennifer.thom@phs.scot>
Date: Fri, 12 May 2023 12:52:00 +0100
Subject: [PATCH 028/200] Update `check_year_valid_tests`

---
 tests/testthat/test-check_year_valid.R | 36 ++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/tests/testthat/test-check_year_valid.R b/tests/testthat/test-check_year_valid.R
index 5cc173e97..02f61d880 100644
--- a/tests/testthat/test-check_year_valid.R
+++ b/tests/testthat/test-check_year_valid.R
@@ -14,6 +14,25 @@ test_that("Check year valid works for specific datasets ", {
   expect_equal(check_year_valid("1415", "MH"), TRUE)
   expect_equal(check_year_valid("1516", "Maternity"), TRUE)
 
+  # year <= "1617"
+  expect_equal(check_year_valid("1415", "AT"), FALSE)
+  expect_equal(check_year_valid("1516", "AT"), FALSE)
+  expect_equal(check_year_valid("1617", "AT"), FALSE)
+  expect_equal(check_year_valid("1718", "AT"), TRUE)
+  expect_equal(check_year_valid("1415", "CH"), FALSE)
+  expect_equal(check_year_valid("1516", "CH"), FALSE)
+  expect_equal(check_year_valid("1617", "CH"), FALSE)
+  expect_equal(check_year_valid("1718", "CH"), TRUE)
+  expect_equal(check_year_valid("1415", "HC"), FALSE)
+  expect_equal(check_year_valid("1516", "HC"), FALSE)
+  expect_equal(check_year_valid("1617", "HC"), FALSE)
+  expect_equal(check_year_valid("1718", "HC"), TRUE)
+  expect_equal(check_year_valid("1415", "SDS"), FALSE)
+  expect_equal(check_year_valid("1516", "SDS"), FALSE)
+  expect_equal(check_year_valid("1617", "SDS"), FALSE)
+  expect_equal(check_year_valid("1718", "SDS"), TRUE)
+
+
   # year >= "2122"
   expect_equal(check_year_valid("2122", "CMH"), FALSE)
   expect_equal(check_year_valid("2122", "DN"), FALSE)
@@ -21,10 +40,27 @@ test_that("Check year valid works for specific datasets ", {
   expect_equal(check_year_valid("2122", "MH"), TRUE)
   expect_equal(check_year_valid("2122", "Maternity"), TRUE)
 
+  # NSUs
+  expect_equal(check_year_valid("1415", "NSU"), TRUE)
+  expect_equal(check_year_valid("1516", "NSU"), TRUE)
+  expect_equal(check_year_valid("1617", "NSU"), TRUE)
+  expect_equal(check_year_valid("1718", "NSU"), TRUE)
+  expect_equal(check_year_valid("1819", "NSU"), TRUE)
+  expect_equal(check_year_valid("1920", "NSU"), TRUE)
+  expect_equal(check_year_valid("2021", "NSU"), TRUE)
+  expect_equal(check_year_valid("2122", "NSU"), TRUE)
+  expect_equal(check_year_valid("2223", "NSU"), FALSE)
 
   # Other extracts not within boundaries
   expect_equal(check_year_valid("2021", "Acute"), TRUE)
   expect_equal(check_year_valid("1920", "Maternity"), TRUE)
   expect_equal(check_year_valid("1819", "MH"), TRUE)
   expect_equal(check_year_valid("1718", "Outpatients"), TRUE)
+
+  # Social care
+  expect_equal(check_year_valid("1819", "AT"), TRUE)
+  expect_equal(check_year_valid("1920", "CH"), TRUE)
+  expect_equal(check_year_valid("2021", "HC"), TRUE)
+  expect_equal(check_year_valid("2122", "SDS"), TRUE)
+
 })

From e5cf2a0b225c0f6d3d4322324e8e4acd38699319 Mon Sep 17 00:00:00 2001
From: Jennit07 <Jennit07@users.noreply.github.com>
Date: Fri, 12 May 2023 11:55:10 +0000
Subject: [PATCH 029/200] Update documentation

---
 man/check_year_valid.Rd | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/man/check_year_valid.Rd b/man/check_year_valid.Rd
index 587a27b2b..e469d7fae 100644
--- a/man/check_year_valid.Rd
+++ b/man/check_year_valid.Rd
@@ -9,7 +9,7 @@ is only available from 2016/17 onwards.}
 check_year_valid(
   year,
   type = c("Acute", "AE", "AT", "CH", "Client", "CMH", "DD", "Deaths", "DN", "GPOoH",
-    "HC", "Homelessness", "Maternity", "MH", "Outpatients", "PIS", "SDS")
+    "HC", "Homelessness", "Maternity", "MH", "NSU", "Outpatients", "PIS", "SDS")
 )
 }
 \arguments{

From 33fe1055ab457e6a51d48a1a7b4ea1296a34bad2 Mon Sep 17 00:00:00 2001
From: Jennifer Thom <jennifer.thom@phs.scot>
Date: Fri, 12 May 2023 13:15:31 +0100
Subject: [PATCH 030/200] Update `add_nsu_cohort` to pick up years valid

---
 R/add_nsu_cohort.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/add_nsu_cohort.R b/R/add_nsu_cohort.R
index d98da5ee1..65794e8ff 100644
--- a/R/add_nsu_cohort.R
+++ b/R/add_nsu_cohort.R
@@ -11,7 +11,7 @@
 add_nsu_cohort <- function(data, year) {
   year_param <- year
 
-  if (year == "2223") {
+  if (!check_year_valid("2223", "NSU")) {
     return(data)
   } else {
     # Check that the variables we need are in the data

From 3473c18f67aa3f0086ac89e567e4953588e04e54 Mon Sep 17 00:00:00 2001
From: Jennit07 <Jennit07@users.noreply.github.com>
Date: Fri, 12 May 2023 12:23:04 +0000
Subject: [PATCH 031/200] Style code

---
 R/check_year_valid.R                   | 2 +-
 tests/testthat/test-check_year_valid.R | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/R/check_year_valid.R b/R/check_year_valid.R
index fcb93984b..1bad7d042 100644
--- a/R/check_year_valid.R
+++ b/R/check_year_valid.R
@@ -26,7 +26,7 @@ check_year_valid <- function(year, type = c(
                                "PIS",
                                "SDS"
                              )) {
-  if (year >= "2223" & type == "NSU"){
+  if (year >= "2223" & type == "NSU") {
     return(FALSE)
   }
 
diff --git a/tests/testthat/test-check_year_valid.R b/tests/testthat/test-check_year_valid.R
index 02f61d880..a7197084c 100644
--- a/tests/testthat/test-check_year_valid.R
+++ b/tests/testthat/test-check_year_valid.R
@@ -62,5 +62,4 @@ test_that("Check year valid works for specific datasets ", {
   expect_equal(check_year_valid("1920", "CH"), TRUE)
   expect_equal(check_year_valid("2021", "HC"), TRUE)
   expect_equal(check_year_valid("2122", "SDS"), TRUE)
-
 })

From fff9badfe039b203aa7509d3b15dd481790a9009 Mon Sep 17 00:00:00 2001
From: Jennifer Thom <jennifer.thom@phs.scot>
Date: Fri, 12 May 2023 13:29:03 +0100
Subject: [PATCH 032/200] remove extra `!`

---
 R/add_nsu_cohort.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/add_nsu_cohort.R b/R/add_nsu_cohort.R
index 65794e8ff..6fbcf9bc1 100644
--- a/R/add_nsu_cohort.R
+++ b/R/add_nsu_cohort.R
@@ -11,7 +11,7 @@
 add_nsu_cohort <- function(data, year) {
   year_param <- year
 
-  if (!check_year_valid("2223", "NSU")) {
+  if (check_year_valid("2223", "NSU")) {
     return(data)
   } else {
     # Check that the variables we need are in the data

From 8a37356e10ae9d9fa582c97f7801f0b83281741a Mon Sep 17 00:00:00 2001
From: Jennifer Thom <jennifer.thom@phs.scot>
Date: Fri, 12 May 2023 13:49:55 +0100
Subject: [PATCH 033/200] Exclude `cij_delay`

---
 R/create_individual_file.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index be1c0ab81..4c5183d3b 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -610,7 +610,7 @@ aggregate_by_chi <- function(episode_file) {
           "CIJ_el",
           "CIJ_non_el",
           "CIJ_mat",
-          "cij_delay",
+          #"cij_delay",
           "OoH_cases" = "unique_ooh_case",
           dplyr::ends_with(
             c(

From b2f694118491cd75c71c779703ae63ce41413398 Mon Sep 17 00:00:00 2001
From: Jennit07 <Jennit07@users.noreply.github.com>
Date: Fri, 12 May 2023 12:55:41 +0000
Subject: [PATCH 034/200] Style code

---
 R/create_individual_file.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index 4c5183d3b..9a2368aae 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -610,7 +610,7 @@ aggregate_by_chi <- function(episode_file) {
           "CIJ_el",
           "CIJ_non_el",
           "CIJ_mat",
-          #"cij_delay",
+          # "cij_delay",
           "OoH_cases" = "unique_ooh_case",
           dplyr::ends_with(
             c(

From 07b03f33fe4e5c4dd141e23339201b851c343ff1 Mon Sep 17 00:00:00 2001
From: Jennit07 <67372904+Jennit07@users.noreply.github.com>
Date: Tue, 16 May 2023 11:50:42 +0100
Subject: [PATCH 035/200] improve `max_no_inf()`

---
 R/create_individual_file.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index 9a2368aae..d01967ca8 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -705,7 +705,7 @@ condition_cols <- function() {
 #'
 #' @param x Vector to return max of
 max_no_inf <- function(x) {
-  ifelse(!all(is.na(x)), max(x, na.rm = TRUE), NA)
+  dplyr::if_else(all(is.na(x)), NA, max(x, na.rm = TRUE))
 }
 
 #' Custom minimum

From 617ac68c825da0ac908693449808b0c54b1b6021 Mon Sep 17 00:00:00 2001
From: Jennifer Thom <jennifer.thom@phs.scot>
Date: Tue, 16 May 2023 11:46:09 +0100
Subject: [PATCH 036/200] Use pmin/max instead of `rowwise`

---
 R/create_individual_file.R | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index d01967ca8..ec18a01e2 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -510,12 +510,10 @@ clean_up_ch <- function(episode_file) {
       fy_end = date_from_fy(year, "end") + 1,
       fy_start = date_from_fy(year, "start")
     ) %>%
-    dplyr::rowwise() %>%
     dplyr::mutate(
-      term_1 = min(ch_ep_end, fy_end + 1),
-      term_2 = max(ch_ep_start, fy_start)
+      term_1 = pmin(ch_ep_end, fy_end + 1),
+      term_2 = pmax(ch_ep_start, fy_start)
     ) %>%
-    dplyr::ungroup() %>%
     dplyr::mutate(
       ch_beddays = dplyr::if_else(
         recid == "CH",

From 01cc1b4e1c5f81ac731ce4c68cbffcbf1d0cadae Mon Sep 17 00:00:00 2001
From: Jennifer Thom <jennifer.thom@phs.scot>
Date: Tue, 16 May 2023 11:52:44 +0100
Subject: [PATCH 037/200] improve `min_no_inf()`

---
 R/create_individual_file.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index ec18a01e2..8e6077fc7 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -714,7 +714,7 @@ max_no_inf <- function(x) {
 #'
 #' @param x Vector to return min of
 min_no_inf <- function(x) {
-  ifelse(!all(is.na(x)), min(x, na.rm = TRUE), NA)
+  ifelse(all(is.na(x)), NA, min(x, na.rm = TRUE))
 }
 
 #' Clean individual file

From 2ff02bdb78bc5c81dcb39978fff18114909a06fb Mon Sep 17 00:00:00 2001
From: Jennifer Thom <jennifer.thom@phs.scot>
Date: Tue, 16 May 2023 12:55:38 +0100
Subject: [PATCH 038/200] Use n_distinct(cij_marker)

---
 R/create_individual_file.R | 1 +
 1 file changed, 1 insertion(+)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index 8e6077fc7..ad6d09132 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -599,6 +599,7 @@ aggregate_by_chi <- function(episode_file) {
     ) %>%
     dplyr::group_by(chi) %>%
     dplyr::summarise(
+      distinct_cij = n_distinct(cij_marker),
       gender = mean(gender),
       dplyr::across(dplyr::ends_with(c(
         "postcode", "DoB", "gpprac"

From 435cd0fdd562925573c77ccff736a9b2cc8cffcf Mon Sep 17 00:00:00 2001
From: Jennifer Thom <jennifer.thom@phs.scot>
Date: Tue, 16 May 2023 13:02:10 +0100
Subject: [PATCH 039/200] deal with distinct(ch_chi_cis)

---
 R/create_individual_file.R | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index ad6d09132..0dea43da0 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -489,6 +489,7 @@ na_type <- function(col = c("DoB", "postcode", "gpprac")) {
 #' @inheritParams create_individual_file
 aggregate_ch_episodes <- function(episode_file) {
   episode_file %>%
+    dplyr::filter(!is.na(.data$ch_chi_cis)) %>%
     dplyr::group_by(.data$chi, .data$ch_chi_cis) %>%
     dplyr::mutate(
       ch_no_cost = max(.data$ch_no_cost),
@@ -496,7 +497,10 @@ aggregate_ch_episodes <- function(episode_file) {
       ch_ep_end = max(.data$ch_ep_end),
       ch_cost_per_day = mean(.data$ch_cost_per_day)
     ) %>%
-    dplyr::ungroup()
+    dplyr::ungroup() %>%
+    dplyr::distinct(.data$chi, .data$ch_chi_cis) %>%
+    dplyr::select(.data$chi, .data$ch_chi_cis, ch_no_cost, ch_ep_start, ch_ep_end, ch_cost_per_day) %>%
+    dplyr::right_join(episode_file, by = c(.data$chi, .data$ch_chi_cis))
 }
 
 #' Clean up CH

From 5a0b5506c14a3ba69f6c4fd6367492399228d68b Mon Sep 17 00:00:00 2001
From: Jennifer Thom <jennifer.thom@phs.scot>
Date: Tue, 16 May 2023 13:32:54 +0100
Subject: [PATCH 040/200] use n_distinct(ooh_case_id)

---
 R/create_individual_file.R | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index 0dea43da0..35889b1a1 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -220,8 +220,10 @@ add_ooh_columns <- function(episode_file, prefix, condition) {
   episode_file <- episode_file %>%
     dplyr::mutate(
       OoH_consultation_time = dplyr::if_else(eval(condition), as.numeric((lubridate::seconds_to_period(.data$keytime2) + .data$record_keydate2) - (lubridate::seconds_to_period(.data$keytime1) + .data$record_keydate1), units = "mins"), NA_real_),
-      OoH_consultation_time = dplyr::if_else(OoH_consultation_time < 0, 0, .data$OoH_consultation_time)
+      OoH_consultation_time = dplyr::if_else(OoH_consultation_time < 0, 0, .data$OoH_consultation_time),
+      unique_ooh_case = dplyr::if_else(recid != "OoH", 0, n_distinct(ooh_case_id))
     )
+
   return(episode_file)
 }
 

From 0da09b03429a7cd66e2aeacd7bb9ab019c86b055 Mon Sep 17 00:00:00 2001
From: Jennifer Thom <jennifer.thom@phs.scot>
Date: Tue, 16 May 2023 15:38:58 +0100
Subject: [PATCH 041/200] remove `find_non_duplicates`

---
 R/create_individual_file.R | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index 35889b1a1..dfa43c75e 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -8,10 +8,7 @@ create_individual_file <- function(episode_file) {
     remove_blank_chi() %>%
     find_non_duplicates(.data$cij_marker, "Distinct_CIJ") %>%
     add_cij_columns() %>%
-    find_non_duplicates(.data$ch_chi_cis, "first_ch_ep") %>%
     add_all_columns() %>%
-    find_non_duplicates(.data$ooh_case_id, "unique_ooh_case") %>%
-    dplyr::mutate(unique_ooh_case = dplyr::if_else(recid != "OoH", 0, unique_ooh_case)) %>%
     aggregate_ch_episodes() %>%
     clean_up_ch() %>%
     recode_gender() %>%

From faa0a966ed8c52c28082b777f08995aaa9e6daba Mon Sep 17 00:00:00 2001
From: Jennit07 <67372904+Jennit07@users.noreply.github.com>
Date: Wed, 17 May 2023 08:27:13 +0100
Subject: [PATCH 042/200] Use dplyr::if_else()

Co-authored-by: James McMahon <james.mcmahon@phs.scot>
---
 R/create_individual_file.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index dfa43c75e..904ecc401 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -718,7 +718,7 @@ max_no_inf <- function(x) {
 #'
 #' @param x Vector to return min of
 min_no_inf <- function(x) {
-  ifelse(all(is.na(x)), NA, min(x, na.rm = TRUE))
+  dplyr::if_else(all(is.na(x)), NA, min(x, na.rm = TRUE))
 }
 
 #' Clean individual file

From 979fc81ae48510b15b096feaf86779064e9d4587 Mon Sep 17 00:00:00 2001
From: Jennifer Thom <jennifer.thom@phs.scot>
Date: Wed, 17 May 2023 08:37:34 +0100
Subject: [PATCH 043/200] Fix typo in `ooh_covid_assessment`

---
 R/create_individual_file.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index 904ecc401..5f25b7afd 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -210,7 +210,7 @@ add_ooh_columns <- function(episode_file, prefix, condition) {
       "{prefix}_other" := dplyr::if_else(eval(condition) & .data$smrtype == "OOH-Other", 1, NA_real_),
       "{prefix}_PCC" := dplyr::if_else(eval(condition) & .data$smrtype == "OOH-PCC", 1, NA_real_),
       ooh_covid_advice = dplyr::if_else(eval(condition) & .data$smrtype == "OOH-C19Adv", 1, NA_real_),
-      ooh_covid_assesment = dplyr::if_else(eval(condition) & .data$smrtype == "OOH-C19Ass", 1, NA_real_),
+      ooh_covid_assessment = dplyr::if_else(eval(condition) & .data$smrtype == "OOH-C19Ass", 1, NA_real_),
       ooh_covid_other = dplyr::if_else(eval(condition) & .data$smrtype == "OOH-C190th", 1, NA_real_)
     )
 

From 6a57809c8b108261a5d9b05176da5ca2cd873cd9 Mon Sep 17 00:00:00 2001
From: Jennifer Thom <jennifer.thom@phs.scot>
Date: Wed, 17 May 2023 08:39:51 +0100
Subject: [PATCH 044/200] Move `ooh_case_id` to aggregate

---
 R/create_individual_file.R | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index 5f25b7afd..7a9eba7fa 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -602,7 +602,8 @@ aggregate_by_chi <- function(episode_file) {
     ) %>%
     dplyr::group_by(chi) %>%
     dplyr::summarise(
-      distinct_cij = n_distinct(cij_marker),
+      distinct_cij = n_distinct("cij_marker"),
+      ooh_cases = n_distinct("ooh_case_id"),
       gender = mean(gender),
       dplyr::across(dplyr::ends_with(c(
         "postcode", "DoB", "gpprac"

From 83fbdcbe1cc744ca4babe3204e8a4bc78919ae27 Mon Sep 17 00:00:00 2001
From: Jennifer Thom <jennifer.thom@phs.scot>
Date: Wed, 17 May 2023 08:52:11 +0100
Subject: [PATCH 045/200] Use `slfhelper::ltc_vars`

---
 R/create_individual_file.R | 22 +---------------------
 1 file changed, 1 insertion(+), 21 deletions(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index 7a9eba7fa..a77ccddd0 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -674,27 +674,7 @@ aggregate_by_chi <- function(episode_file) {
 #' which follow format "condition" and "condition_date" e.g.
 #' "dementia" and "dementia_date"
 condition_cols <- function() {
-  conditions <- c(
-    "arth",
-    "asthma",
-    "atrialfib",
-    "cancer",
-    "cvd",
-    "liver",
-    "copd",
-    "dementia",
-    "diabetes",
-    "epilepsy",
-    "chd",
-    "hefailure",
-    "ms",
-    "parkinsons",
-    "refailure",
-    "congen",
-    "bloodbfo",
-    "endomet",
-    "digestive"
-  )
+  conditions <- slfhelper::ltc_vars
   date_cols <- paste0(conditions, "_date")
   all_cols <- c(conditions, date_cols)
   return(all_cols)

From 8a761c0e7e1a0f96b9e3e7b0266fefaf98c5a714 Mon Sep 17 00:00:00 2001
From: Jennifer Thom <jennifer.thom@phs.scot>
Date: Wed, 17 May 2023 09:38:18 +0100
Subject: [PATCH 046/200] Remove `clean_up_dob` Already done in
 `correct_demographics`

---
 R/create_individual_file.R | 46 --------------------------------------
 1 file changed, 46 deletions(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index a77ccddd0..b9e0222de 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -711,7 +711,6 @@ clean_individual_file <- function(individual_file) {
   individual_file %>%
     drop_cols() %>%
     clean_up_gender() %>%
-    clean_up_dob() %>%
     dplyr::mutate(
       age = floor(as.numeric(lubridate::interval(.data$DoB, date_from_fy(year, "mid")), "years"))
     ) %>%
@@ -767,51 +766,6 @@ clean_up_gender <- function(individual_file) {
     )
 }
 
-#' Clean up date of birth column
-#'
-#' @description Clean up column containing date of birth.
-#'
-#' @inheritParams clean_individual_file
-clean_up_dob <- function(individual_file) {
-  individual_file %>%
-    dplyr::mutate(
-      chi_dob_1 = lubridate::dmy(paste0(substr(.data$chi, 1, 4), "19", substr(.data$chi, 5, 6))),
-      chi_dob_2 = lubridate::dmy(paste0(substr(.data$chi, 1, 4), "20", substr(.data$chi, 5, 6))),
-      chi_age_1 = as.numeric(lubridate::interval(lubridate::ymd(.data$chi_dob_1), date_from_fy(year, "mid")), "years"),
-      chi_age_2 = as.numeric(lubridate::interval(lubridate::ymd(.data$chi_dob_2), date_from_fy(year, "mid")), "years")
-    ) %>%
-    dplyr::rowwise() %>%
-    dplyr::mutate(
-      dob_condition_1 = .data$chi_dob_1 %in% dplyr::pick(dplyr::ends_with("_DoB")) & !(chi_dob_2 %in% dplyr::pick(dplyr::ends_with("_DoB"))),
-      dob_condition_2 = .data$chi_dob_2 %in% dplyr::pick(dplyr::ends_with("_DoB")) & !(chi_dob_1 %in% dplyr::pick(dplyr::ends_with("_DoB"))),
-      dob_condition_3 = .data$chi_dob_2 > min(lubridate::today(), date_from_fy(year, "end")),
-      dob_condition_4 = unclass(.data$chi_dob_2) > min_no_inf(as.numeric(dplyr::pick(.data$arth_date:.data$death_date))),
-      dob_condition_5 = .data$congen_date %in% c(.data$chi_dob_1, .data$chi_dob_2)
-    ) %>%
-    dplyr::ungroup() %>%
-    dplyr::mutate(
-      DoB = dplyr::case_when(
-        .data$dob_condition_1 ~ .data$chi_dob_1,
-        .data$dob_condition_2 ~ .data$chi_dob_2
-      )
-    ) %>%
-    dplyr::mutate(
-      DoB = dplyr::case_when(
-        is.na(.data$DoB) & is.na(.data$chi_dob_1) & !is.na(.data$chi_dob_2) ~ .data$chi_dob_2,
-        is.na(.data$DoB) & is.na(.data$chi_dob_2) & !is.na(.data$chi_dob_1) ~ .data$chi_dob_1,
-        is.na(.data$DoB) & .data$chi_age_2 < 0 ~ .data$chi_dob_1,
-        is.na(.data$DoB) & .data$dob_condition_3 ~ .data$chi_dob_1,
-        is.na(.data$DoB) & .data$dob_condition_4 ~ .data$chi_dob_1,
-        is.na(.data$DoB) & .data$dob_condition_5 ~ .data$congen_date,
-        is.na(.data$DoB) & .data$chi_age_1 > 115 ~ .data$chi_dob_2,
-        TRUE ~ .data$DoB
-      )
-    ) %>%
-    fill_dob() %>%
-    dplyr::select(
-      -dplyr::starts_with(c("dob_condition_", "chi_dob_", "chi_age_"))
-    )
-}
 
 #' Fill missing date of births
 #'

From 46a7b70599e60f65461b90e22e52653a88da8d10 Mon Sep 17 00:00:00 2001
From: Jennit07 <Jennit07@users.noreply.github.com>
Date: Wed, 17 May 2023 08:42:21 +0000
Subject: [PATCH 047/200] Update documentation

---
 man/clean_up_dob.Rd | 14 --------------
 1 file changed, 14 deletions(-)
 delete mode 100644 man/clean_up_dob.Rd

diff --git a/man/clean_up_dob.Rd b/man/clean_up_dob.Rd
deleted file mode 100644
index 4b9003726..000000000
--- a/man/clean_up_dob.Rd
+++ /dev/null
@@ -1,14 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/create_individual_file.R
-\name{clean_up_dob}
-\alias{clean_up_dob}
-\title{Clean up date of birth column}
-\usage{
-clean_up_dob(individual_file)
-}
-\arguments{
-\item{individual_file}{Individual file where each row represents a unique CHI}
-}
-\description{
-Clean up column containing date of birth.
-}

From 6424c952f0f36e80d23eb6709720640c3615a5b7 Mon Sep 17 00:00:00 2001
From: James McMahon <james.mcmahon@phs.scot>
Date: Wed, 17 May 2023 08:54:44 +0000
Subject: [PATCH 048/200] [check-spelling] Update metadata

Update for https://github.com/Public-Health-Scotland/source-linkage-files/actions/runs/4981058958/attempts/1
Accepted in https://github.com/Public-Health-Scotland/source-linkage-files/pull/654#issuecomment-1551009850

Signed-off-by: check-spelling-bot <check-spelling-bot@users.noreply.github.com>
---
 .github/actions/spelling/expect.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/actions/spelling/expect.txt b/.github/actions/spelling/expect.txt
index 4b36c7d8e..c2380b4c6 100644
--- a/.github/actions/spelling/expect.txt
+++ b/.github/actions/spelling/expect.txt
@@ -65,6 +65,7 @@ fyyear
 geogs
 ggplot
 GLS
+GPOo
 gpprac
 gss
 hbnames

From 89268dc6ca9e706d08a88fc765388d533eafa9aa Mon Sep 17 00:00:00 2001
From: Jennifer Thom <jennifer.thom@phs.scot>
Date: Wed, 17 May 2023 10:59:10 +0100
Subject: [PATCH 049/200] Use `start_next_fy_quarter` in place of rowwise

---
 R/create_individual_file.R | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index b9e0222de..77e3e34f2 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -305,13 +305,11 @@ add_ch_columns <- function(episode_file, prefix, condition) {
       ch_cost_per_day = dplyr::if_else(eval(condition) & .data$yearstay > 0, .data$cost_total_net / .data$yearstay, NA_real_),
       ch_cost_per_day = dplyr::if_else(eval(condition) & .data$yearstay == 0, .data$cost_total_net / .data$yearstay, .data$ch_cost_per_day),
       ch_no_cost = eval(condition) & is.na(ch_cost_per_day),
-      ch_ep_end = dplyr::if_else(eval(condition), .data$record_keydate2, lubridate::NA_Date_)
-    ) %>%
-    dplyr::rowwise() %>%
-    dplyr::mutate(
-      ch_ep_end = dplyr::if_else(eval(condition) & is.na(ch_ep_end), lubridate::quarter(zoo::as.yearqtr(.data$sc_latest_submission), type = "date_first", fiscal_start = 4), .data$ch_ep_end)
-    ) %>%
-    dplyr::ungroup()
+      ch_ep_end = dplyr::if_else(eval(condition), .data$record_keydate2, lubridate::NA_Date_),
+      # If end date is missing use the first day of next FY quarter
+      ch_ep_end = dplyr::if_else(eval(condition) & is.na(ch_ep_end), start_next_fy_quarter(sc_latest_submission), .data$ch_ep_end)
+    )
+
 }
 
 #' Add HC columns

From b6d93ed1f79573e6dbe179ff54dc7228235157a1 Mon Sep 17 00:00:00 2001
From: Jennit07 <Jennit07@users.noreply.github.com>
Date: Wed, 17 May 2023 10:06:51 +0000
Subject: [PATCH 050/200] Style code

---
 R/create_individual_file.R | 1 -
 1 file changed, 1 deletion(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index 77e3e34f2..e89731dd0 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -309,7 +309,6 @@ add_ch_columns <- function(episode_file, prefix, condition) {
       # If end date is missing use the first day of next FY quarter
       ch_ep_end = dplyr::if_else(eval(condition) & is.na(ch_ep_end), start_next_fy_quarter(sc_latest_submission), .data$ch_ep_end)
     )
-
 }
 
 #' Add HC columns

From d4e1d4154a76e32f4bf8fe6b5c856747c213f782 Mon Sep 17 00:00:00 2001
From: Jennifer Thom <jennifer.thom@phs.scot>
Date: Wed, 17 May 2023 11:10:58 +0100
Subject: [PATCH 051/200] Use `compute_mid_year_age`

---
 R/create_individual_file.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index e89731dd0..5ea0b8853 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -709,7 +709,7 @@ clean_individual_file <- function(individual_file) {
     drop_cols() %>%
     clean_up_gender() %>%
     dplyr::mutate(
-      age = floor(as.numeric(lubridate::interval(.data$DoB, date_from_fy(year, "mid")), "years"))
+      age = compute_mid_year_age(year, .data$DoB)
     ) %>%
     clean_up_postcode()
 }

From eac15ed5cc8578ed5cd88b4ea5867bc1a780f903 Mon Sep 17 00:00:00 2001
From: Jennifer Thom <jennifer.thom@phs.scot>
Date: Wed, 17 May 2023 11:18:53 +0100
Subject: [PATCH 052/200] Convert data to data.table to improve speed

---
 R/create_individual_file.R | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index 5ea0b8853..3cab259a2 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -597,7 +597,9 @@ aggregate_by_chi <- function(episode_file) {
       record_keydate2,
       keytime2
     ) %>%
-    dplyr::group_by(chi) %>%
+    # use as.data.table to change the data format to data.table to accelerate
+    data.table::as.data.table() %>%
+    dplyr::group_by("chi") %>%
     dplyr::summarise(
       distinct_cij = n_distinct("cij_marker"),
       ooh_cases = n_distinct("ooh_case_id"),
@@ -662,7 +664,9 @@ aggregate_by_chi <- function(episode_file) {
         ),
         ~ dplyr::first(., na_rm = TRUE)
       )
-    )
+    ) %>%
+    # change the data format from data.table to data.frame
+    tibble::as_tibble()
 }
 
 #' Condition columns

From 4f6d6ff91b63ebaeadbc38cf5d9a628bc3e2eb6b Mon Sep 17 00:00:00 2001
From: Jennifer Thom <jennifer.thom@phs.scot>
Date: Wed, 17 May 2023 12:58:56 +0100
Subject: [PATCH 053/200] Update `get_fy_dates` function

---
 R/get_fy_dates.R | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/R/get_fy_dates.R b/R/get_fy_dates.R
index 970e049d1..3a4b96afb 100644
--- a/R/get_fy_dates.R
+++ b/R/get_fy_dates.R
@@ -20,9 +20,9 @@ start_fy <- function(year, format = c("fyyear", "alternate")) {
   format <- match.arg(format)
 
   if (format == "fyyear") {
-    start_fy <- as.Date(paste0(convert_fyyear_to_year(year), "-04-01"))
+    start_fy <- lubridate::make_date(convert_fyyear_to_year(year), 4, 1)
   } else if (format == "alternate") {
-    start_fy <- as.Date(paste0(year, "-04-01"))
+    start_fy <- lubridate::make_date(year, 4, 1)
   }
 
   return(start_fy)
@@ -47,16 +47,14 @@ end_fy <- function(year, format = c("fyyear", "alternate")) {
     format <- "fyyear"
   }
 
+  year <- as.numeric(paste0("20", substr(year, 3, 4)))
+
   format <- match.arg(format)
 
   if (format == "fyyear") {
-    end_fy <- as.Date(
-      paste0(as.numeric(convert_fyyear_to_year(year)) + 1L, "-03-31")
-    )
+    end_fy <- lubridate::make_date(year, 3, 31)
   } else if (format == "alternate") {
-    end_fy <- as.Date(
-      paste0(as.numeric(year) + 1L, "-03-31")
-    )
+    end_fy <- lubridate::make_date(year + 1L, 3, 31)
   }
 
   return(end_fy)
@@ -85,9 +83,9 @@ midpoint_fy <- function(year, format = c("fyyear", "alternate")) {
   format <- match.arg(format)
 
   if (format == "fyyear") {
-    midpoint_fy <- as.Date(paste0(convert_fyyear_to_year(year), "-09-30"))
+    midpoint_fy <- lubridate::make_date(convert_fyyear_to_year(year), 9, 30)
   } else if (format == "alternate") {
-    midpoint_fy <- as.Date(paste0(year, "-09-30"))
+    midpoint_fy <- lubridate::make_date(year, 9, 30)
   }
 
   return(midpoint_fy)

From 4c9134bd3b8ed1d8bfae0ddd86502a17c0e29189 Mon Sep 17 00:00:00 2001
From: Jennifer Thom <jennifer.thom@phs.scot>
Date: Wed, 17 May 2023 13:08:40 +0100
Subject: [PATCH 054/200] remove `date_from_fy`, use `get_fy_dates`

---
 R/create_individual_file.R | 29 ++---------------------------
 1 file changed, 2 insertions(+), 27 deletions(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index 3cab259a2..af1931d08 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -507,8 +507,8 @@ aggregate_ch_episodes <- function(episode_file) {
 clean_up_ch <- function(episode_file) {
   episode_file %>%
     dplyr::mutate(
-      fy_end = date_from_fy(year, "end") + 1,
-      fy_start = date_from_fy(year, "start")
+      fy_end = end_fy(year),
+      fy_start = start_fy(year)
     ) %>%
     dplyr::mutate(
       term_1 = pmin(ch_ep_end, fy_end + 1),
@@ -541,31 +541,6 @@ clean_up_ch <- function(episode_file) {
     )
 }
 
-#' Date from FY
-#'
-#' @description Return start, mid, or end date from financial year in format "2122".
-#'
-#' @param financial_year Financial year represented in "YYYY" format e.g. "2122"
-#' @param type One of "start", "end", and "mid", representing the date to return
-date_from_fy <- function(financial_year, type = c("start", "end", "mid")) {
-  match.arg(type)
-  n <- switch(type,
-    "start" = 0,
-    "mid" = 0,
-    "end" = 2
-  )
-  year <- as.numeric(paste0("20", substr(financial_year, 1 + n, 2 + n)))
-  if (type == "start") {
-    date <- lubridate::make_date(year, 4, 1)
-    return(date)
-  } else if (type == "end") {
-    date <- lubridate::make_date(year, 3, 31)
-    return(date)
-  }
-  date <- lubridate::make_date(year, 9, 30)
-  return(date)
-}
-
 #' Recode gender
 #'
 #' @description Recode gender to 1.5 if 0 or 9.

From 3730ee14eace457dc4dddff828880a2bf1209544 Mon Sep 17 00:00:00 2001
From: Jennit07 <Jennit07@users.noreply.github.com>
Date: Wed, 17 May 2023 12:12:14 +0000
Subject: [PATCH 055/200] Update documentation

---
 man/date_from_fy.Rd | 16 ----------------
 1 file changed, 16 deletions(-)
 delete mode 100644 man/date_from_fy.Rd

diff --git a/man/date_from_fy.Rd b/man/date_from_fy.Rd
deleted file mode 100644
index cc3b8f8a3..000000000
--- a/man/date_from_fy.Rd
+++ /dev/null
@@ -1,16 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/create_individual_file.R
-\name{date_from_fy}
-\alias{date_from_fy}
-\title{Date from FY}
-\usage{
-date_from_fy(financial_year, type = c("start", "end", "mid"))
-}
-\arguments{
-\item{financial_year}{Financial year represented in "YYYY" format e.g. "2122"}
-
-\item{type}{One of "start", "end", and "mid", representing the date to return}
-}
-\description{
-Return start, mid, or end date from financial year in format "2122".
-}

From c9852b44a6db597b5d899bbbdb3517c4bd537b77 Mon Sep 17 00:00:00 2001
From: Jennifer Thom <jennifer.thom@phs.scot>
Date: Wed, 17 May 2023 15:10:01 +0100
Subject: [PATCH 056/200] Remove `clean_up_postcode` function; not needed
 anymore

---
 R/create_individual_file.R | 20 --------------------
 1 file changed, 20 deletions(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index af1931d08..1b41b93ad 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -779,26 +779,6 @@ replace_dob_with_col <- function(individual_file, col) {
     )
 }
 
-#' Clean up postcode column
-#'
-#' @description Clean up column containing postcode.
-#'
-#' @inheritParams clean_individual_file
-clean_up_postcode <- function(individual_file) {
-  postcode_lookup <- readr::read_rds(get_slf_postcode_path())
-  individual_file %>%
-    dplyr::mutate(
-      # all_blank is TRUE when all postcode variables are blank
-      all_blank = all(is.na(dplyr::pick(dplyr::ends_with("_postcode")))),
-      # Use NRS_postcode to store the dummy for no other reason than it's last
-      # in the hierarchy
-      HL1_postcode = dplyr::if_else(
-        all_blank == 1,
-        "XXX XXX",
-        .data$HL1_postcode
-      )
-    )
-}
 
 #' Fill missing postcodes
 #'

From 3714bcaa4e54d5956d8e0761a33a06927c0d115d Mon Sep 17 00:00:00 2001
From: Jennifer Thom <jennifer.thom@phs.scot>
Date: Wed, 17 May 2023 15:15:32 +0100
Subject: [PATCH 057/200] Remove `find_non_duplicates`; count distinct in aggregate

---
 R/create_individual_file.R | 63 +++++++++++++++++---------------------
 1 file changed, 28 insertions(+), 35 deletions(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index 1b41b93ad..6c55f2b40 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -6,7 +6,6 @@
 create_individual_file <- function(episode_file) {
   episode_file %>%
     remove_blank_chi() %>%
-    find_non_duplicates(.data$cij_marker, "Distinct_CIJ") %>%
     add_cij_columns() %>%
     add_all_columns() %>%
     aggregate_ch_episodes() %>%
@@ -22,26 +21,14 @@ create_individual_file <- function(episode_file) {
 #'
 #' @inheritParams create_individual_file
 remove_blank_chi <- function(episode_file) {
+
+  cli::cli_alert_info("Remove blank CHI at {Sys.time()} and the memory usage was {object.size()}")
+
   episode_file %>%
     dplyr::mutate(chi = dplyr::na_if(.data$chi, "")) %>%
     dplyr::filter(!is.na(.data$chi))
 }
 
-#' Find non-duplicates
-#'
-#' @description Create new column which marks first (per group)
-#' non-duplicated observation as 1, with any duplicates marked as 0.
-#'
-#' @inheritParams create_individual_file
-#' @param group Column to group by
-#' @param col_name Name of new column
-find_non_duplicates <- function(episode_file, group, col_name) {
-  episode_file %>%
-    dplyr::group_by(.data$chi, {{ group }}) %>%
-    dplyr::mutate("{col_name}" := dplyr::if_else(duplicated({{ group }}), 0, 1)) %>%
-    dplyr::ungroup() %>%
-    dplyr::mutate("{col_name}" := dplyr::if_else(is.na({{ group }}), 0, .data[[col_name]]))
-}
 
 #' Add CIJ-related columns
 #'
@@ -52,34 +39,35 @@ add_cij_columns <- function(episode_file) {
   episode_file %>%
     dplyr::mutate(
       CIJ_non_el = dplyr::if_else(.data$cij_pattype_code == 0,
-        .data$Distinct_CIJ,
+        .data$cij_marker,
         NA_real_
       ),
       CIJ_el = dplyr::if_else(.data$cij_pattype_code == 1,
-        .data$Distinct_CIJ,
+        .data$cij_marker,
         NA_real_
       ),
       CIJ_mat = dplyr::if_else(.data$cij_pattype_code == 2,
-        .data$Distinct_CIJ,
+        .data$cij_marker,
         NA_real_
       )
     ) %>%
     # dplyr::mutate(cij_delay = dplyr::if_else(
-    #   (.data$cij_delay == 1 & .data$Distinct_CIJ == 1),
+    #   (.data$cij_delay == 1 & .data$cij_marker == 1),
     #   1,
     #   0
     # )) %>%
     dplyr::mutate(
       preventable_admissions = dplyr::if_else(
-        (.data$cij_ppa == 1 & .data$Distinct_CIJ == 1),
+        (.data$cij_ppa == 1 & .data$cij_marker == 1),
         1,
         0
-      ),
-      preventable_beddays = dplyr::if_else(
-        (.data$cij_ppa == 1 & .data$Distinct_CIJ == 1),
-        as.numeric(.data$cij_end_date - .data$cij_start_date),
-        0
-      )
+      ) # ,
+      # Come back to here
+      # preventable_beddays = dplyr::if_else(
+      #   (.data$cij_ppa == 1 & .data$Distinct_cij == 1),
+      #   as.numeric(.data$cij_end_date - .data$cij_start_date),
+      #   0
+      # )
     )
 }
 
@@ -218,7 +206,6 @@ add_ooh_columns <- function(episode_file, prefix, condition) {
     dplyr::mutate(
       OoH_consultation_time = dplyr::if_else(eval(condition), as.numeric((lubridate::seconds_to_period(.data$keytime2) + .data$record_keydate2) - (lubridate::seconds_to_period(.data$keytime1) + .data$record_keydate1), units = "mins"), NA_real_),
       OoH_consultation_time = dplyr::if_else(OoH_consultation_time < 0, 0, .data$OoH_consultation_time),
-      unique_ooh_case = dplyr::if_else(recid != "OoH", 0, n_distinct(ooh_case_id))
     )
 
   return(episode_file)
@@ -565,6 +552,8 @@ recode_gender <- function(episode_file) {
 #' @inheritParams create_individual_file
 aggregate_by_chi <- function(episode_file) {
   episode_file %>%
+    # use as.data.table to change the data format to data.table to accelerate
+    data.table::as.data.table() %>%
     dplyr::arrange(
       chi,
       record_keydate1,
@@ -572,23 +561,26 @@ aggregate_by_chi <- function(episode_file) {
       record_keydate2,
       keytime2
     ) %>%
-    # use as.data.table to change the data format to data.table to accelerate
-    data.table::as.data.table() %>%
     dplyr::group_by("chi") %>%
     dplyr::summarise(
-      distinct_cij = n_distinct("cij_marker"),
-      ooh_cases = n_distinct("ooh_case_id"),
       gender = mean(gender),
       dplyr::across(dplyr::ends_with(c(
         "postcode", "DoB", "gpprac"
       )), ~ dplyr::last(., na_rm = TRUE)),
       dplyr::across(
         c(
+          "cij_total" = "cij_marker",
           "CIJ_el",
           "CIJ_non_el",
           "CIJ_mat",
           # "cij_delay",
-          "OoH_cases" = "unique_ooh_case",
+          "ooh_cases" = "ooh_case_id",
+          "preventable_admissions"
+        ),
+        ~ dplyr::n_distinct(.x, na.rm = TRUE)
+      ),
+      dplyr::across(
+        c(
           dplyr::ends_with(
             c(
               "episodes",
@@ -605,13 +597,14 @@ aggregate_by_chi <- function(episode_file) {
               "homeV",
               "time",
               "admissions",
-              "assesment",
+              "assessment",
               "other",
               "DN",
               "NHS24",
               "PCC",
               "_dnas"
-            )
+            ),
+            -"preventable_admissions"
           ),
           dplyr::starts_with("SDS_option")
         ),

From 15ae96afaefa3b8521c9b7d667b9f4feb3252ceb Mon Sep 17 00:00:00 2001
From: Jennit07 <Jennit07@users.noreply.github.com>
Date: Wed, 17 May 2023 14:18:07 +0000
Subject: [PATCH 058/200] Style code

---
 R/create_individual_file.R | 1 -
 1 file changed, 1 deletion(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index 6c55f2b40..9d29c71ba 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -21,7 +21,6 @@ create_individual_file <- function(episode_file) {
 #'
 #' @inheritParams create_individual_file
 remove_blank_chi <- function(episode_file) {
-
   cli::cli_alert_info("Remove blank CHI at {Sys.time()} and the memory usage was {object.size()}")
 
   episode_file %>%

From e182a1488f66ff909b1e861c8ddcf66533953f76 Mon Sep 17 00:00:00 2001
From: Jennit07 <Jennit07@users.noreply.github.com>
Date: Wed, 17 May 2023 14:18:12 +0000
Subject: [PATCH 059/200] Update documentation

---
 man/clean_up_postcode.Rd   | 14 --------------
 man/find_non_duplicates.Rd | 19 -------------------
 2 files changed, 33 deletions(-)
 delete mode 100644 man/clean_up_postcode.Rd
 delete mode 100644 man/find_non_duplicates.Rd

diff --git a/man/clean_up_postcode.Rd b/man/clean_up_postcode.Rd
deleted file mode 100644
index b3cd91548..000000000
--- a/man/clean_up_postcode.Rd
+++ /dev/null
@@ -1,14 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/create_individual_file.R
-\name{clean_up_postcode}
-\alias{clean_up_postcode}
-\title{Clean up postcode column}
-\usage{
-clean_up_postcode(individual_file)
-}
-\arguments{
-\item{individual_file}{Individual file where each row represents a unique CHI}
-}
-\description{
-Clean up column containing postcode.
-}
diff --git a/man/find_non_duplicates.Rd b/man/find_non_duplicates.Rd
deleted file mode 100644
index ba82bd5c4..000000000
--- a/man/find_non_duplicates.Rd
+++ /dev/null
@@ -1,19 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/create_individual_file.R
-\name{find_non_duplicates}
-\alias{find_non_duplicates}
-\title{Find non-duplicates}
-\usage{
-find_non_duplicates(episode_file, group, col_name)
-}
-\arguments{
-\item{episode_file}{Tibble containing episodic data}
-
-\item{group}{Column to group by}
-
-\item{col_name}{Name of new column}
-}
-\description{
-Create new column which marks first (per group)
-non-duplicated observation as 1, with any duplicates marked as 0.
-}

From 73852cc5c02ca347332c5be57720325d0c252d0f Mon Sep 17 00:00:00 2001
From: Jennifer Thom <jennifer.thom@phs.scot>
Date: Wed, 17 May 2023 15:47:09 +0100
Subject: [PATCH 060/200] Add time stamps to `create_individual_file`

---
 R/create_individual_file.R | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index 9d29c71ba..e4c68e2d2 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -21,7 +21,7 @@ create_individual_file <- function(episode_file) {
 #'
 #' @inheritParams create_individual_file
 remove_blank_chi <- function(episode_file) {
-  cli::cli_alert_info("Remove blank CHI at {Sys.time()} and the memory usage was {object.size()}")
+  cli::cli_alert_info("Remove blank CHI function started at {Sys.time()}")
 
   episode_file %>%
     dplyr::mutate(chi = dplyr::na_if(.data$chi, "")) %>%
@@ -35,6 +35,9 @@ remove_blank_chi <- function(episode_file) {
 #'
 #' @inheritParams create_individual_file
 add_cij_columns <- function(episode_file) {
+
+  cli::cli_alert_info("Add cij columns function started at {Sys.time()}")
+
   episode_file %>%
     dplyr::mutate(
       CIJ_non_el = dplyr::if_else(.data$cij_pattype_code == 0,
@@ -77,6 +80,9 @@ add_cij_columns <- function(episode_file) {
 #'
 #' @inheritParams create_individual_file
 add_all_columns <- function(episode_file) {
+
+  cli::cli_alert_info("Add all columns function started at {Sys.time()}")
+
   episode_file %>%
     add_acute_columns("Acute", (.data$smrtype == "Acute-DC" | .data$smrtype == "Acute-IP") & .data$cij_pattype != "Maternity") %>%
     add_mat_columns("Mat", .data$recid == "02B" | .data$cij_pattype == "Maternity") %>%
@@ -470,6 +476,9 @@ na_type <- function(col = c("DoB", "postcode", "gpprac")) {
 #'
 #' @inheritParams create_individual_file
 aggregate_ch_episodes <- function(episode_file) {
+
+  cli::cli_alert_info("Aggregate ch episodes function started at {Sys.time()}")
+
   episode_file %>%
     dplyr::filter(!is.na(.data$ch_chi_cis)) %>%
     dplyr::group_by(.data$chi, .data$ch_chi_cis) %>%
@@ -491,6 +500,9 @@ aggregate_ch_episodes <- function(episode_file) {
 #'
 #' @inheritParams create_individual_file
 clean_up_ch <- function(episode_file) {
+
+  cli::cli_alert_info("Clean up CH function started at {Sys.time()}")
+
   episode_file %>%
     dplyr::mutate(
       fy_end = end_fy(year),
@@ -533,6 +545,9 @@ clean_up_ch <- function(episode_file) {
 #'
 #' @inheritParams create_individual_file
 recode_gender <- function(episode_file) {
+
+  cli::cli_alert_info("Recode Gender function started at {Sys.time()}")
+
   episode_file %>%
     dplyr::mutate(
       gender = dplyr::if_else(
@@ -550,6 +565,9 @@ recode_gender <- function(episode_file) {
 #'
 #' @inheritParams create_individual_file
 aggregate_by_chi <- function(episode_file) {
+
+  cli::cli_alert_info("Aggregate by CHI function started at {Sys.time()}")
+
   episode_file %>%
     # use as.data.table to change the data format to data.table to accelerate
     data.table::as.data.table() %>%
@@ -676,6 +694,9 @@ min_no_inf <- function(x) {
 #'
 #' @param individual_file Individual file where each row represents a unique CHI
 clean_individual_file <- function(individual_file) {
+
+  cli::cli_alert_info("Clean individual file function started at {Sys.time()}")
+
   individual_file %>%
     drop_cols() %>%
     clean_up_gender() %>%

From a358cc57367850bf66ed2bb1e0cf6fc875e941bf Mon Sep 17 00:00:00 2001
From: Jennit07 <Jennit07@users.noreply.github.com>
Date: Wed, 17 May 2023 14:51:12 +0000
Subject: [PATCH 061/200] Style code

---
 R/create_individual_file.R | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index e4c68e2d2..e591f0b27 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -35,7 +35,6 @@ remove_blank_chi <- function(episode_file) {
 #'
 #' @inheritParams create_individual_file
 add_cij_columns <- function(episode_file) {
-
   cli::cli_alert_info("Add cij columns function started at {Sys.time()}")
 
   episode_file %>%
@@ -80,7 +79,6 @@ add_cij_columns <- function(episode_file) {
 #'
 #' @inheritParams create_individual_file
 add_all_columns <- function(episode_file) {
-
   cli::cli_alert_info("Add all columns function started at {Sys.time()}")
 
   episode_file %>%
@@ -476,7 +474,6 @@ na_type <- function(col = c("DoB", "postcode", "gpprac")) {
 #'
 #' @inheritParams create_individual_file
 aggregate_ch_episodes <- function(episode_file) {
-
   cli::cli_alert_info("Aggregate ch episodes function started at {Sys.time()}")
 
   episode_file %>%
@@ -500,7 +497,6 @@ aggregate_ch_episodes <- function(episode_file) {
 #'
 #' @inheritParams create_individual_file
 clean_up_ch <- function(episode_file) {
-
   cli::cli_alert_info("Clean up CH function started at {Sys.time()}")
 
   episode_file %>%
@@ -545,7 +541,6 @@ clean_up_ch <- function(episode_file) {
 #'
 #' @inheritParams create_individual_file
 recode_gender <- function(episode_file) {
-
   cli::cli_alert_info("Recode Gender function started at {Sys.time()}")
 
   episode_file %>%
@@ -565,7 +560,6 @@ recode_gender <- function(episode_file) {
 #'
 #' @inheritParams create_individual_file
 aggregate_by_chi <- function(episode_file) {
-
   cli::cli_alert_info("Aggregate by CHI function started at {Sys.time()}")
 
   episode_file %>%
@@ -694,7 +688,6 @@ min_no_inf <- function(x) {
 #'
 #' @param individual_file Individual file where each row represents a unique CHI
 clean_individual_file <- function(individual_file) {
-
   cli::cli_alert_info("Clean individual file function started at {Sys.time()}")
 
   individual_file %>%

From ca0c7b68a51b9e768a1faca866db8d08cddf866a Mon Sep 17 00:00:00 2001
From: Jennifer Thom <jennifer.thom@phs.scot>
Date: Thu, 18 May 2023 07:46:46 +0100
Subject: [PATCH 062/200] remove `clean_up_postcode`

---
 R/create_individual_file.R | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index e591f0b27..5bd66ddee 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -695,8 +695,7 @@ clean_individual_file <- function(individual_file) {
     clean_up_gender() %>%
     dplyr::mutate(
       age = compute_mid_year_age(year, .data$DoB)
-    ) %>%
-    clean_up_postcode()
+    )
 }
 
 #' Drop redundant columns

From 2cb8a24b98f9fa351e0bd68aafa57a2a9b66aa6d Mon Sep 17 00:00:00 2001
From: Jennifer Thom <jennifer.thom@phs.scot>
Date: Thu, 18 May 2023 08:00:48 +0100
Subject: [PATCH 063/200] Deal with ch cis episodes

---
 R/create_individual_file.R | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index 5bd66ddee..6b172e6e3 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -291,7 +291,6 @@ add_ch_columns <- function(episode_file, prefix, condition) {
   episode_file %>%
     add_standard_cols(prefix, condition) %>%
     dplyr::mutate(
-      ch_cis_episodes = dplyr::if_else(eval(condition), .data$first_ch_ep, NA_real_),
       ch_cost_per_day = dplyr::if_else(eval(condition) & .data$yearstay > 0, .data$cost_total_net / .data$yearstay, NA_real_),
       ch_cost_per_day = dplyr::if_else(eval(condition) & .data$yearstay == 0, .data$cost_total_net / .data$yearstay, .data$ch_cost_per_day),
       ch_no_cost = eval(condition) & is.na(ch_cost_per_day),
@@ -579,7 +578,7 @@ aggregate_by_chi <- function(episode_file) {
         "postcode", "DoB", "gpprac"
       )), ~ dplyr::last(., na_rm = TRUE)),
       dplyr::across(
-        c(
+        c("ch_cis_episodes" = "ch_chi_cis",
           "cij_total" = "cij_marker",
           "CIJ_el",
           "CIJ_non_el",

From fee2b46bed4b4d4587696c3b9c38bf7e46a8756e Mon Sep 17 00:00:00 2001
From: Jennit07 <Jennit07@users.noreply.github.com>
Date: Thu, 18 May 2023 07:02:47 +0000
Subject: [PATCH 064/200] Style code

---
 R/create_individual_file.R | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index 6b172e6e3..607b4fcc2 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -578,7 +578,8 @@ aggregate_by_chi <- function(episode_file) {
         "postcode", "DoB", "gpprac"
       )), ~ dplyr::last(., na_rm = TRUE)),
       dplyr::across(
-        c("ch_cis_episodes" = "ch_chi_cis",
+        c(
+          "ch_cis_episodes" = "ch_chi_cis",
           "cij_total" = "cij_marker",
           "CIJ_el",
           "CIJ_non_el",

From ee36738ede0a0119960832c05d2538b7d43b9020 Mon Sep 17 00:00:00 2001
From: Jennifer Thom <jennifer.thom@phs.scot>
Date: Thu, 18 May 2023 08:32:24 +0100
Subject: [PATCH 065/200] add .data$

---
 R/create_individual_file.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index 607b4fcc2..d97393cfa 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -293,10 +293,10 @@ add_ch_columns <- function(episode_file, prefix, condition) {
     dplyr::mutate(
       ch_cost_per_day = dplyr::if_else(eval(condition) & .data$yearstay > 0, .data$cost_total_net / .data$yearstay, NA_real_),
       ch_cost_per_day = dplyr::if_else(eval(condition) & .data$yearstay == 0, .data$cost_total_net / .data$yearstay, .data$ch_cost_per_day),
-      ch_no_cost = eval(condition) & is.na(ch_cost_per_day),
+      ch_no_cost = eval(condition) & is.na(.data$ch_cost_per_day),
       ch_ep_end = dplyr::if_else(eval(condition), .data$record_keydate2, lubridate::NA_Date_),
       # If end date is missing use the first day of next FY quarter
-      ch_ep_end = dplyr::if_else(eval(condition) & is.na(ch_ep_end), start_next_fy_quarter(sc_latest_submission), .data$ch_ep_end)
+      ch_ep_end = dplyr::if_else(eval(condition) & is.na(.data$ch_ep_end), start_next_fy_quarter(.data$sc_latest_submission), .data$ch_ep_end)
     )
 }
 

From feef2b62ad5e0aa197bc974a0892d57d7b14e71c Mon Sep 17 00:00:00 2001
From: Jennifer Thom <jennifer.thom@phs.scot>
Date: Thu, 18 May 2023 08:33:03 +0100
Subject: [PATCH 066/200] Turn ch aggregate into a data table

---
 R/create_individual_file.R | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index d97393cfa..ffa61a93b 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -476,7 +476,9 @@ aggregate_ch_episodes <- function(episode_file) {
   cli::cli_alert_info("Aggregate ch episodes function started at {Sys.time()}")
 
   episode_file %>%
-    dplyr::filter(!is.na(.data$ch_chi_cis)) %>%
+   # dplyr::filter(!is.na(.data$ch_chi_cis)) %>%
+    # use as.data.table to change the data format to data.table to accelerate
+    data.table::as.data.table() %>%
     dplyr::group_by(.data$chi, .data$ch_chi_cis) %>%
     dplyr::mutate(
       ch_no_cost = max(.data$ch_no_cost),
@@ -485,9 +487,12 @@ aggregate_ch_episodes <- function(episode_file) {
       ch_cost_per_day = mean(.data$ch_cost_per_day)
     ) %>%
     dplyr::ungroup() %>%
-    dplyr::distinct(.data$chi, .data$ch_chi_cis) %>%
-    dplyr::select(.data$chi, .data$ch_chi_cis, ch_no_cost, ch_ep_start, ch_ep_end, ch_cost_per_day) %>%
-    dplyr::right_join(episode_file, by = c(.data$chi, .data$ch_chi_cis))
+    # change the data format from data.table to data.frame
+    tibble::as_tibble()
+
+   # dplyr::distinct(.data$chi, .data$ch_chi_cis) %>%
+   # dplyr::select(.data$chi, .data$ch_chi_cis, .data$ch_no_cost, .data$ch_ep_start, .data$ch_ep_end, .data$ch_cost_per_day) %>%
+   # dplyr::right_join(episode_file, by = c(.data$chi, .data$ch_chi_cis))
 }
 
 #' Clean up CH

From da13d9224eb3866546613b44029a71ce8936e83a Mon Sep 17 00:00:00 2001
From: Jennit07 <Jennit07@users.noreply.github.com>
Date: Thu, 18 May 2023 07:36:00 +0000
Subject: [PATCH 067/200] Style code

---
 R/create_individual_file.R | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index ffa61a93b..9142091dd 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -476,7 +476,7 @@ aggregate_ch_episodes <- function(episode_file) {
   cli::cli_alert_info("Aggregate ch episodes function started at {Sys.time()}")
 
   episode_file %>%
-   # dplyr::filter(!is.na(.data$ch_chi_cis)) %>%
+    # dplyr::filter(!is.na(.data$ch_chi_cis)) %>%
     # use as.data.table to change the data format to data.table to accelerate
     data.table::as.data.table() %>%
     dplyr::group_by(.data$chi, .data$ch_chi_cis) %>%
@@ -490,9 +490,9 @@ aggregate_ch_episodes <- function(episode_file) {
     # change the data format from data.table to data.frame
     tibble::as_tibble()
 
-   # dplyr::distinct(.data$chi, .data$ch_chi_cis) %>%
-   # dplyr::select(.data$chi, .data$ch_chi_cis, .data$ch_no_cost, .data$ch_ep_start, .data$ch_ep_end, .data$ch_cost_per_day) %>%
-   # dplyr::right_join(episode_file, by = c(.data$chi, .data$ch_chi_cis))
+  # dplyr::distinct(.data$chi, .data$ch_chi_cis) %>%
+  # dplyr::select(.data$chi, .data$ch_chi_cis, .data$ch_no_cost, .data$ch_ep_start, .data$ch_ep_end, .data$ch_cost_per_day) %>%
+  # dplyr::right_join(episode_file, by = c(.data$chi, .data$ch_chi_cis))
 }
 
 #' Clean up CH

From 7fc40fa01d97de52a8ed8a5423c8735ca50844cf Mon Sep 17 00:00:00 2001
From: Jennifer Thom <jennifer.thom@phs.scot>
Date: Thu, 18 May 2023 08:51:13 +0100
Subject: [PATCH 068/200] use ch_chi_cis

---
 R/create_individual_file.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index 9142091dd..bdd2dd98b 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -524,12 +524,12 @@ clean_up_ch <- function(episode_file) {
         NA_real_
       ),
       ch_beddays = dplyr::if_else(
-        recid == "CH" & first_ch_ep == 0,
+        recid == "CH" & ch_chi_cis == 0,
         0,
         ch_beddays
       ),
       ch_cost = dplyr::if_else(
-        recid == "CH" & first_ch_ep == 0,
+        recid == "CH" & ch_chi_cis == 0,
         0,
         ch_cost
       )

From 45eeca09f3d5f8f3b5de8a9f6a6099770ed80e69 Mon Sep 17 00:00:00 2001
From: Jennifer Thom <jennifer.thom@phs.scot>
Date: Thu, 18 May 2023 09:27:02 +0100
Subject: [PATCH 069/200] remove `preventable_admissions` from aggregate

---
 R/create_individual_file.R | 48 ++++++++++++++++++--------------------
 1 file changed, 23 insertions(+), 25 deletions(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index bdd2dd98b..eec9ee2fd 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -612,7 +612,6 @@ aggregate_by_chi <- function(episode_file) {
               "advice",
               "homeV",
               "time",
-              "admissions",
               "assessment",
               "other",
               "DN",
@@ -620,33 +619,32 @@ aggregate_by_chi <- function(episode_file) {
               "PCC",
               "_dnas"
             ),
-            -"preventable_admissions"
+            dplyr::starts_with("SDS_option")
           ),
-          dplyr::starts_with("SDS_option")
+          ~ sum(., na.rm = TRUE)
         ),
-        ~ sum(., na.rm = TRUE)
-      ),
-      dplyr::across(
-        c(
-          dplyr::starts_with("sc_"),
-          -"sc_send_lca",
-          -"sc_latest_submission",
-          "HL1_in_FY" = "hh_in_fy",
-          "NSU"
-        ),
-        ~ max_no_inf(.)
-      ),
-      dplyr::across(
-        c(
-          condition_cols(),
-          "death_date",
-          "deceased",
-          "year",
-          dplyr::ends_with(c(
-            "_Cohort", "end_fy", "start_fy"
-          )),
+        dplyr::across(
+          c(
+            dplyr::starts_with("sc_"),
+            -"sc_send_lca",
+            -"sc_latest_submission",
+            "HL1_in_FY" = "hh_in_fy",
+            "NSU"
+          ),
+          ~ max_no_inf(.)
         ),
-        ~ dplyr::first(., na_rm = TRUE)
+        dplyr::across(
+          c(
+            condition_cols(),
+            "death_date",
+            "deceased",
+            "year",
+            dplyr::ends_with(c(
+              "_Cohort", "end_fy", "start_fy"
+            )),
+          ),
+          ~ dplyr::first(., na_rm = TRUE)
+        )
       )
     ) %>%
     # change the data format from data.table to data.frame

From d89b0aae78b714fa174bab5b90d003f10975a713 Mon Sep 17 00:00:00 2001
From: Jennifer Thom <jennifer.thom@phs.scot>
Date: Thu, 18 May 2023 10:17:49 +0100
Subject: [PATCH 070/200] exclude `hh_in_fy` for now

---
 R/create_individual_file.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index eec9ee2fd..f18bdbc08 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -628,7 +628,7 @@ aggregate_by_chi <- function(episode_file) {
             dplyr::starts_with("sc_"),
             -"sc_send_lca",
             -"sc_latest_submission",
-            "HL1_in_FY" = "hh_in_fy",
+            #"HL1_in_FY" = "hh_in_fy",
             "NSU"
           ),
           ~ max_no_inf(.)

From 2326c0f4fb36cde1eb0df6e54c79870b77cae95e Mon Sep 17 00:00:00 2001
From: Jennit07 <Jennit07@users.noreply.github.com>
Date: Thu, 18 May 2023 09:19:54 +0000
Subject: [PATCH 071/200] Style code

---
 R/create_individual_file.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index f18bdbc08..340267561 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -628,7 +628,7 @@ aggregate_by_chi <- function(episode_file) {
             dplyr::starts_with("sc_"),
             -"sc_send_lca",
             -"sc_latest_submission",
-            #"HL1_in_FY" = "hh_in_fy",
+            # "HL1_in_FY" = "hh_in_fy",
             "NSU"
           ),
           ~ max_no_inf(.)

From 78d2c3643d41b5bec65e88330fd9956c541cb1e6 Mon Sep 17 00:00:00 2001
From: Jennifer Thom <jennifer.thom@phs.scot>
Date: Thu, 18 May 2023 12:54:20 +0100
Subject: [PATCH 072/200] Test - exclude `sc_` vars from aggregate

---
 R/create_individual_file.R | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index 340267561..3d65cfde9 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -625,9 +625,9 @@ aggregate_by_chi <- function(episode_file) {
         ),
         dplyr::across(
           c(
-            dplyr::starts_with("sc_"),
-            -"sc_send_lca",
-            -"sc_latest_submission",
+            #dplyr::starts_with("sc_"),
+            #-"sc_send_lca",
+            #-"sc_latest_submission",
             # "HL1_in_FY" = "hh_in_fy",
             "NSU"
           ),

From 3ac7d269cc2b0847947c795e704cf3122095b896 Mon Sep 17 00:00:00 2001
From: Jennit07 <Jennit07@users.noreply.github.com>
Date: Thu, 18 May 2023 11:56:41 +0000
Subject: [PATCH 073/200] Style code

---
 R/create_individual_file.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index 3d65cfde9..c4ef08742 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -625,7 +625,7 @@ aggregate_by_chi <- function(episode_file) {
         ),
         dplyr::across(
           c(
-            #dplyr::starts_with("sc_"),
+            # dplyr::starts_with("sc_"),
             #-"sc_send_lca",
             #-"sc_latest_submission",
             # "HL1_in_FY" = "hh_in_fy",

From 141c8808cf7a459aa0dd5ccb3e6d7ff54f4cdaca Mon Sep 17 00:00:00 2001
From: Jennifer Thom <jennifer.thom@phs.scot>
Date: Thu, 18 May 2023 13:45:49 +0100
Subject: [PATCH 074/200] Exclude for now

---
 R/create_individual_file.R | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index c4ef08742..72ae73e1f 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -623,16 +623,16 @@ aggregate_by_chi <- function(episode_file) {
           ),
           ~ sum(., na.rm = TRUE)
         ),
-        dplyr::across(
-          c(
-            # dplyr::starts_with("sc_"),
-            #-"sc_send_lca",
-            #-"sc_latest_submission",
-            # "HL1_in_FY" = "hh_in_fy",
-            "NSU"
-          ),
-          ~ max_no_inf(.)
-        ),
+        # dplyr::across(
+        #   c(
+        #     # dplyr::starts_with("sc_"),
+        #     #-"sc_send_lca",
+        #     #-"sc_latest_submission",
+        #     # "HL1_in_FY" = "hh_in_fy",
+        #     "NSU"
+        #   ),
+        #   ~ max_no_inf(.)
+        # ),
         dplyr::across(
           c(
             condition_cols(),

From 93fcd437b63c2874d36d7eda9350b732fcdc14d4 Mon Sep 17 00:00:00 2001
From: Jennifer Thom <jennifer.thom@phs.scot>
Date: Thu, 18 May 2023 14:31:00 +0100
Subject: [PATCH 075/200] exclude for now

---
 R/create_individual_file.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index 72ae73e1f..b3f79cd3d 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -636,8 +636,8 @@ aggregate_by_chi <- function(episode_file) {
         dplyr::across(
           c(
             condition_cols(),
-            "death_date",
-            "deceased",
+            #"death_date",
+            #"deceased",
             "year",
             dplyr::ends_with(c(
               "_Cohort", "end_fy", "start_fy"

From baf5d1339cdd1ccdc13d3f3ed6c876c6f80ccec8 Mon Sep 17 00:00:00 2001
From: Jennit07 <Jennit07@users.noreply.github.com>
Date: Thu, 18 May 2023 13:36:16 +0000
Subject: [PATCH 076/200] Style code

---
 R/create_individual_file.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index b3f79cd3d..4a25e8222 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -636,8 +636,8 @@ aggregate_by_chi <- function(episode_file) {
         dplyr::across(
           c(
             condition_cols(),
-            #"death_date",
-            #"deceased",
+            # "death_date",
+            # "deceased",
             "year",
             dplyr::ends_with(c(
               "_Cohort", "end_fy", "start_fy"

From 3bf8fb77c9eae035892f5b800bb60428a46df250 Mon Sep 17 00:00:00 2001
From: Jennifer Thom <jennifer.thom@phs.scot>
Date: Tue, 23 May 2023 14:00:49 +0100
Subject: [PATCH 077/200] automate `check_year_valid`

---
 R/add_nsu_cohort.R | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/R/add_nsu_cohort.R b/R/add_nsu_cohort.R
index 6fbcf9bc1..bb0d9cc4e 100644
--- a/R/add_nsu_cohort.R
+++ b/R/add_nsu_cohort.R
@@ -11,9 +11,10 @@
 add_nsu_cohort <- function(data, year) {
   year_param <- year
 
-  if (check_year_valid("2223", "NSU")) {
+  if (!check_year_valid(year, "NSU")) {
     return(data)
-  } else {
+  }
+
     # Check that the variables we need are in the data
     check_variables_exist(data,
       variables = c(
@@ -113,5 +114,4 @@ add_nsu_cohort <- function(data, year) {
       dplyr::select(-dplyr::contains("_nsu"), -"has_chi")
 
     return(return_df)
-  }
 }

From 3e5a059cbb326e0785a4e2b5d7f89610705827af Mon Sep 17 00:00:00 2001
From: Jennifer Thom <jennifer.thom@phs.scot>
Date: Tue, 23 May 2023 14:03:05 +0100
Subject: [PATCH 078/200] Return dummy file path for NSU not valid

---
 R/get_nsu_paths.R | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/R/get_nsu_paths.R b/R/get_nsu_paths.R
index c4c430672..2d53f0c84 100644
--- a/R/get_nsu_paths.R
+++ b/R/get_nsu_paths.R
@@ -10,11 +10,16 @@
 #' @family file path functions
 #' @seealso [get_file_path()] for the generic function.
 get_nsu_path <- function(year, ...) {
-  nsu_file_path <- get_file_path(
-    directory = fs::path(get_slf_dir(), "NSU"),
-    file_name = stringr::str_glue("All_CHIs_20{year}.parquet"),
-    ...
-  )
+
+  if (!check_year_valid(year, "NSU")) {
+    return(get_dummy_boxi_extract_path())
+  }
+
+    nsu_file_path <- get_file_path(
+      directory = fs::path(get_slf_dir(), "NSU"),
+      file_name = stringr::str_glue("All_CHIs_20{year}.parquet"),
+      ...
+    )
 
   return(nsu_file_path)
 }

From bfeffc7007688ebb741a15211b2e9bd417925be6 Mon Sep 17 00:00:00 2001
From: Jennit07 <Jennit07@users.noreply.github.com>
Date: Tue, 23 May 2023 13:13:11 +0000
Subject: [PATCH 079/200] Style code

---
 R/add_nsu_cohort.R | 190 ++++++++++++++++++++++-----------------------
 R/get_nsu_paths.R  |  11 ++-
 2 files changed, 100 insertions(+), 101 deletions(-)

diff --git a/R/add_nsu_cohort.R b/R/add_nsu_cohort.R
index bb0d9cc4e..c5a26da12 100644
--- a/R/add_nsu_cohort.R
+++ b/R/add_nsu_cohort.R
@@ -15,103 +15,103 @@ add_nsu_cohort <- function(data, year) {
     return(data)
   }
 
-    # Check that the variables we need are in the data
-    check_variables_exist(data,
-      variables = c(
-        "year",
-        "chi",
-        "recid",
-        "smrtype",
-        "postcode",
-        "gpprac",
-        "dob",
-        "gender"
-      )
+  # Check that the variables we need are in the data
+  check_variables_exist(data,
+    variables = c(
+      "year",
+      "chi",
+      "recid",
+      "smrtype",
+      "postcode",
+      "gpprac",
+      "dob",
+      "gender"
     )
+  )
 
-    matched <- dplyr::full_join(data,
-      # NSU cohort file
-      read_file(get_nsu_path(year)) %>%
-        dplyr::mutate(
-          dob = as.Date(.data[["dob"]]),
-          gpprac = convert_eng_gpprac_to_dummy(.data[["gpprac"]])
-        ),
-      # Match on by chi
-      by = "chi",
-      # Name the incoming variables with "_nsu"
-      suffix = c("", "_nsu"),
-      # Keep the chi from both sources
-      keep = TRUE
-    ) %>%
-      # Change the chi from the NSU cohort to a boolean
-      dplyr::mutate(has_chi = !is_missing(.data[["chi_nsu"]]))
-
-    return_df <- matched %>%
-      # Get data from non service user lookup if the recid is empty
+  matched <- dplyr::full_join(data,
+    # NSU cohort file
+    read_file(get_nsu_path(year)) %>%
       dplyr::mutate(
-        year = year_param,
-        recid = dplyr::if_else(
-          is_missing(.data[["recid"]]),
-          "NSU",
-          .data[["recid"]]
-        ),
-        smrtype = dplyr::if_else(
-          is_missing(.data[["recid"]]),
-          "Non-User",
-          .data[["smrtype"]]
-        ),
-        postcode = dplyr::if_else(
-          is_missing(.data[["recid"]]),
-          .data[["postcode_nsu"]],
-          .data[["postcode"]]
-        ),
-        gpprac = dplyr::if_else(
-          is_missing(.data[["recid"]]),
-          .data[["gpprac_nsu"]],
-          .data[["gpprac"]]
-        ),
-        dob = dplyr::if_else(
-          is_missing(.data[["recid"]]),
-          .data[["dob_nsu"]],
-          .data[["dob"]]
-        ),
-        gender = dplyr::if_else(
-          is_missing(.data[["recid"]]),
-          .data[["gender_nsu"]],
-          .data[["gender"]]
-        )
-      ) %>%
-      # If the data has come from the NSU cohort,
-      # use that data for the below variables
-      dplyr::mutate(
-        postcode = dplyr::if_else(
-          is_missing(.data[["postcode"]]) & .data[["has_chi"]],
-          .data[["postcode_nsu"]],
-          .data[["postcode"]]
-        ),
-        gpprac = dplyr::if_else(
-          is.na(.data[["gpprac"]]) & .data[["has_chi"]],
-          .data[["gpprac_nsu"]],
-          .data[["gpprac"]]
-        ),
-        dob = dplyr::if_else(
-          is.na(.data[["dob"]]) & .data[["has_chi"]],
-          .data[["dob_nsu"]],
-          .data[["dob"]]
-        ),
-        gender = dplyr::if_else(
-          is.na(.data[["gender"]]) & .data[["has_chi"]],
-          .data[["gender_nsu"]],
-          .data[["gender"]]
-        ),
-        chi = dplyr::if_else(
-          is_missing(.data[["chi"]]) & .data[["has_chi"]],
-          .data[["chi_nsu"]],
-          .data[["chi"]]
-        )
-      ) %>%
-      # Remove the additional columns
-      dplyr::select(-dplyr::contains("_nsu"), -"has_chi")
+        dob = as.Date(.data[["dob"]]),
+        gpprac = convert_eng_gpprac_to_dummy(.data[["gpprac"]])
+      ),
+    # Match on by chi
+    by = "chi",
+    # Name the incoming variables with "_nsu"
+    suffix = c("", "_nsu"),
+    # Keep the chi from both sources
+    keep = TRUE
+  ) %>%
+    # Change the chi from the NSU cohort to a boolean
+    dplyr::mutate(has_chi = !is_missing(.data[["chi_nsu"]]))
+
+  return_df <- matched %>%
+    # Get data from non service user lookup if the recid is empty
+    dplyr::mutate(
+      year = year_param,
+      recid = dplyr::if_else(
+        is_missing(.data[["recid"]]),
+        "NSU",
+        .data[["recid"]]
+      ),
+      smrtype = dplyr::if_else(
+        is_missing(.data[["recid"]]),
+        "Non-User",
+        .data[["smrtype"]]
+      ),
+      postcode = dplyr::if_else(
+        is_missing(.data[["recid"]]),
+        .data[["postcode_nsu"]],
+        .data[["postcode"]]
+      ),
+      gpprac = dplyr::if_else(
+        is_missing(.data[["recid"]]),
+        .data[["gpprac_nsu"]],
+        .data[["gpprac"]]
+      ),
+      dob = dplyr::if_else(
+        is_missing(.data[["recid"]]),
+        .data[["dob_nsu"]],
+        .data[["dob"]]
+      ),
+      gender = dplyr::if_else(
+        is_missing(.data[["recid"]]),
+        .data[["gender_nsu"]],
+        .data[["gender"]]
+      )
+    ) %>%
+    # If the data has come from the NSU cohort,
+    # use that data for the below variables
+    dplyr::mutate(
+      postcode = dplyr::if_else(
+        is_missing(.data[["postcode"]]) & .data[["has_chi"]],
+        .data[["postcode_nsu"]],
+        .data[["postcode"]]
+      ),
+      gpprac = dplyr::if_else(
+        is.na(.data[["gpprac"]]) & .data[["has_chi"]],
+        .data[["gpprac_nsu"]],
+        .data[["gpprac"]]
+      ),
+      dob = dplyr::if_else(
+        is.na(.data[["dob"]]) & .data[["has_chi"]],
+        .data[["dob_nsu"]],
+        .data[["dob"]]
+      ),
+      gender = dplyr::if_else(
+        is.na(.data[["gender"]]) & .data[["has_chi"]],
+        .data[["gender_nsu"]],
+        .data[["gender"]]
+      ),
+      chi = dplyr::if_else(
+        is_missing(.data[["chi"]]) & .data[["has_chi"]],
+        .data[["chi_nsu"]],
+        .data[["chi"]]
+      )
+    ) %>%
+    # Remove the additional columns
+    dplyr::select(-dplyr::contains("_nsu"), -"has_chi")
 
-    return(return_df)
+  return(return_df)
 }
diff --git a/R/get_nsu_paths.R b/R/get_nsu_paths.R
index 2d53f0c84..107a92168 100644
--- a/R/get_nsu_paths.R
+++ b/R/get_nsu_paths.R
@@ -10,16 +10,15 @@
 #' @family file path functions
 #' @seealso [get_file_path()] for the generic function.
 get_nsu_path <- function(year, ...) {
-
   if (!check_year_valid(year, "NSU")) {
     return(get_dummy_boxi_extract_path())
   }
 
-    nsu_file_path <- get_file_path(
-      directory = fs::path(get_slf_dir(), "NSU"),
-      file_name = stringr::str_glue("All_CHIs_20{year}.parquet"),
-      ...
-    )
+  nsu_file_path <- get_file_path(
+    directory = fs::path(get_slf_dir(), "NSU"),
+    file_name = stringr::str_glue("All_CHIs_20{year}.parquet"),
+    ...
+  )
 
   return(nsu_file_path)
 }

From 4aacf7a4ab1faa2a3b15fd8ccc657c2618eb234b Mon Sep 17 00:00:00 2001
From: Jennifer Thom <jennifer.thom@phs.scot>
Date: Wed, 24 May 2023 12:54:06 +0100
Subject: [PATCH 080/200] Fix brackets in aggregate

---
 R/create_individual_file.R | 61 +++++++++++++++++++-------------------
 1 file changed, 31 insertions(+), 30 deletions(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index 4a25e8222..064f5c529 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -576,12 +576,13 @@ aggregate_by_chi <- function(episode_file) {
       record_keydate2,
       keytime2
     ) %>%
-    dplyr::group_by("chi") %>%
+    dplyr::group_by(.data$chi) %>%
     dplyr::summarise(
       gender = mean(gender),
-      dplyr::across(dplyr::ends_with(c(
-        "postcode", "DoB", "gpprac"
-      )), ~ dplyr::last(., na_rm = TRUE)),
+      dplyr::across(
+        dplyr::ends_with(c("postcode", "DoB", "gpprac")),
+        ~ dplyr::last(., na_rm = TRUE)
+      ),
       dplyr::across(
         c(
           "ch_cis_episodes" = "ch_chi_cis",
@@ -614,37 +615,37 @@ aggregate_by_chi <- function(episode_file) {
               "time",
               "assessment",
               "other",
-              "DN",
+              # "DN",
               "NHS24",
               "PCC",
               "_dnas"
-            ),
-            dplyr::starts_with("SDS_option")
+            )
           ),
-          ~ sum(., na.rm = TRUE)
+          dplyr::starts_with("SDS_option")
         ),
-        # dplyr::across(
-        #   c(
-        #     # dplyr::starts_with("sc_"),
-        #     #-"sc_send_lca",
-        #     #-"sc_latest_submission",
-        #     # "HL1_in_FY" = "hh_in_fy",
-        #     "NSU"
-        #   ),
-        #   ~ max_no_inf(.)
-        # ),
-        dplyr::across(
-          c(
-            condition_cols(),
-            # "death_date",
-            # "deceased",
-            "year",
-            dplyr::ends_with(c(
-              "_Cohort", "end_fy", "start_fy"
-            )),
-          ),
-          ~ dplyr::first(., na_rm = TRUE)
-        )
+        ~ sum(., na.rm = TRUE)
+      ),
+      # dplyr::across(
+      #   c(
+      #     # dplyr::starts_with("sc_"),
+      #     #-"sc_send_lca",
+      #     #-"sc_latest_submission",
+      #     # "HL1_in_FY" = "hh_in_fy",
+      #     "NSU"
+      #   ),
+      #   ~ max_no_inf(.)
+      # ),
+      dplyr::across(
+        c(
+          condition_cols(),
+          # "death_date",
+          # "deceased",
+          "year",
+          dplyr::ends_with(c(
+            "_Cohort", "end_fy", "start_fy"
+          )),
+        ),
+        ~ dplyr::first(., na_rm = TRUE)
       )
     ) %>%
     # change the data format from data.table to data.frame

From 5bf6a4b124e69517944d8dee831a7da2d7bc767c Mon Sep 17 00:00:00 2001
From: Jennifer Thom <jennifer.thom@phs.scot>
Date: Wed, 24 May 2023 12:54:44 +0100
Subject: [PATCH 081/200] TEMP - exclude variables

---
 R/create_individual_file.R | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index 064f5c529..ffed97b20 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -90,8 +90,8 @@ add_all_columns <- function(episode_file) {
     add_ae_columns("AE", .data$recid == "AE2") %>%
     add_pis_columns("PIS", .data$recid == "PIS") %>%
     add_ooh_columns("OoH", .data$recid == "OoH") %>%
-    add_dn_columns("DN", .data$recid == "DN") %>%
-    add_cmh_columns("CMH", .data$recid == "CMH") %>%
+    # add_dn_columns("DN", .data$recid == "DN") %>%
+    # add_cmh_columns("CMH", .data$recid == "CMH") %>%
     # add_dd_columns("DD", .data$recid == "DD") %>%
     add_nsu_columns("NSU", .data$recid == "NSU") %>%
     add_nrs_columns("NRS", .data$recid == "NRS") %>%
@@ -715,8 +715,8 @@ drop_cols <- function(individual_file) {
       -"dob",
       -"postcode",
       -"gpprac",
-      -"no_paid_items",
-      -"total_no_dn_contacts"
+      -"no_paid_items"#,
+      #-"total_no_dn_contacts"
     )
 }
 

From e045cccc7a3c5718b321a4348b064f9759504806 Mon Sep 17 00:00:00 2001
From: Jennifer Thom <jennifer.thom@phs.scot>
Date: Wed, 24 May 2023 12:55:19 +0100
Subject: [PATCH 082/200] Use `phsmethods::sex_from_chi`

---
 R/create_individual_file.R | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index ffed97b20..314f2abf3 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -741,12 +741,7 @@ clean_up_gender <- function(individual_file) {
     dplyr::mutate(
       gender = dplyr::case_when(
         .data$gender != 1.5 ~ round(.data$gender),
-        as.numeric(substr(.data$chi, 9, 9)) %% 2 == 1 ~ 1,
-        TRUE ~ 2
-      ),
-      gender = dplyr::case_when(
-        .data$gender == 1 ~ "Male",
-        .data$gender == 2 ~ "Female"
+        .default = phsmethods::sex_from_chi(.data$chi, chi_check = FALSE)
       )
     )
 }

From 173ae02055d34c380c6dc3b772b667369ddbc2ad Mon Sep 17 00:00:00 2001
From: Jennit07 <Jennit07@users.noreply.github.com>
Date: Wed, 24 May 2023 11:58:26 +0000
Subject: [PATCH 083/200] Style code

---
 R/create_individual_file.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index 314f2abf3..d1d787683 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -715,7 +715,7 @@ drop_cols <- function(individual_file) {
       -"dob",
       -"postcode",
       -"gpprac",
-      -"no_paid_items"#,
+      -"no_paid_items" # ,
       #-"total_no_dn_contacts"
     )
 }

From e5332ee71a0d77997f2fac2c9e007c1819e5cdab Mon Sep 17 00:00:00 2001
From: Jennifer Thom <jennifer.thom@phs.scot>
Date: Wed, 24 May 2023 18:07:46 +0100
Subject: [PATCH 084/200] Add ungroup()

---
 R/create_individual_file.R | 1 +
 1 file changed, 1 insertion(+)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index d1d787683..cf481065b 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -648,6 +648,7 @@ aggregate_by_chi <- function(episode_file) {
         ~ dplyr::first(., na_rm = TRUE)
       )
     ) %>%
+    ungroup() %>%
     # change the data format from data.table to data.frame
     tibble::as_tibble()
 }

From cec63a3498f418c5439180e3b45b21aa83931bc4 Mon Sep 17 00:00:00 2001
From: Jennifer Thom <jennifer.thom@phs.scot>
Date: Wed, 24 May 2023 18:09:30 +0100
Subject: [PATCH 085/200] lowercase dob

---
 R/create_individual_file.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index cf481065b..fb5813b14 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -699,7 +699,7 @@ clean_individual_file <- function(individual_file) {
     drop_cols() %>%
     clean_up_gender() %>%
     dplyr::mutate(
-      age = compute_mid_year_age(year, .data$DoB)
+      age = compute_mid_year_age(year, .data$dob)
     )
 }
 

From 8a652dfe96249c026ac30d31378dab45a7da9fa1 Mon Sep 17 00:00:00 2001
From: Jennifer Thom <jennifer.thom@phs.scot>
Date: Thu, 25 May 2023 11:06:44 +0100
Subject: [PATCH 086/200] Remove as.data.table

---
 R/create_individual_file.R | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index fb5813b14..fe440c723 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -567,8 +567,6 @@ aggregate_by_chi <- function(episode_file) {
   cli::cli_alert_info("Aggregate by CHI function started at {Sys.time()}")
 
   episode_file %>%
-    # use as.data.table to change the data format to data.table to accelerate
-    data.table::as.data.table() %>%
     dplyr::arrange(
       chi,
       record_keydate1,
@@ -648,9 +646,7 @@ aggregate_by_chi <- function(episode_file) {
         ~ dplyr::first(., na_rm = TRUE)
       )
     ) %>%
-    ungroup() %>%
-    # change the data format from data.table to data.frame
-    tibble::as_tibble()
+    dplyr::ungroup()
 }
 
 #' Condition columns

From fc979d9642aa33a751362c738c0b78a35ace22da Mon Sep 17 00:00:00 2001
From: Zihao Li <lizihao_anu@outlook.com>
Date: Wed, 31 May 2023 16:52:48 +0100
Subject: [PATCH 087/200] rewrite aggregate_by_chi with data.table

---
 R/aggregate_by_chi_zihao.R | 125 +++++++++++++++++++++++++++++++++++++
 1 file changed, 125 insertions(+)
 create mode 100644 R/aggregate_by_chi_zihao.R

diff --git a/R/aggregate_by_chi_zihao.R b/R/aggregate_by_chi_zihao.R
new file mode 100644
index 000000000..85b87e413
--- /dev/null
+++ b/R/aggregate_by_chi_zihao.R
@@ -0,0 +1,125 @@
+library(data.table)
+
+aggregate_by_chi_zihao <- function(episode_file) {
+  cli::cli_alert_info("Aggregate by CHI function started at {Sys.time()}")
+
+  data.table::setDT(episode_file)  # Convert to data.table
+
+  # Sort the data within each chunk
+  data.table::setkeyv(episode_file, c("chi", "record_keydate1", "keytime1", "record_keydate2", "keytime2"))
+
+  data.table::setnames(
+    episode_file,
+    c("ch_chi_cis", "cij_marker", "ooh_case_id"
+      #,"hh_in_fy"
+    ),
+    c("ch_cis_episodes", "cij_total", "ooh_cases"
+      #,"HL1_in_FY"
+    )
+  )
+
+  # Initialize an empty data.table for the aggregated results
+  aggregated_data <- data.table::data.table()
+
+  # Process the data in chunks
+  chunk_size <- min(nrow(episode_file), 1e7)  # Adjust the chunk size as per your system's memory capacity
+  n_chunks <- nrow(episode_file) %/% chunk_size
+
+
+  # colums specification
+  cols2 <- names(episode_file)[grepl("postcode$|DoB$|gpprac$",
+                                     names(episode_file),
+                                     ignore.case = TRUE)]
+  cols3 <- c(
+    "ch_cis_episodes",
+    "cij_total",
+    "CIJ_el",
+    "CIJ_non_el",
+    "CIJ_mat",
+    # "cij_delay",
+    "ooh_cases",
+    "preventable_admissions"
+  )
+  cols4 <- names(episode_file)[grepl(
+    paste(
+      "episodes$",
+      "beddays$",
+      "cost$",
+      "attendances$",
+      "attend$",
+      # "contacts$",
+      "hours$",
+      "alarms$",
+      "telecare$",
+      "paid_items$",
+      "advice$",
+      "homeV$",
+      "time$",
+      "assessment$",
+      "other$",
+      # "DN$",
+      "NHS24$",
+      "PCC$",
+      "_dnas$",
+      "^SDS_option",
+      sep = "|"
+    ),
+    names(episode_file),
+    ignore.case = TRUE
+  )]
+  cols4 <- cols4[!(cols4 %in% c("ch_cis_episodes"))]
+  # cols5 <- names(episode_file)[grepl("^sc|HL1_in_FY|NSU", names(episode_file), ignore.case = TRUE)]
+  # cols5 <- cols5[!(cols5 %in% c("sc_send_lca", "sc_latest_submission"))]
+  cols6 <- c(condition_cols(),
+             # "death_date",
+             # "deceased",
+             "year",
+             names(episode_file)[grepl("_Cohort$|end_fy$|start_fy$",
+                                       names(episode_file),
+                                       ignore.case = TRUE)])
+
+  for (i in 1:n_chunks) {
+    start <- (i - 1) * chunk_size + 1
+    end <- i * chunk_size
+    # Subset the data to the current chunk
+    chunk <- episode_file[start:end]
+
+    # compute
+    chunk_cols1 <- chunk[,
+                         .(gender = mean(gender)),
+                         by = chi]
+    chunk_cols2 <- chunk[,
+                         .SD[.N],
+                         # .SDcols = patterns("postcode$|DoB$|gpprac$"),
+                         .SDcols = cols2,
+                         by = chi]
+    chunk_cols3 <- chunk[,
+                         lapply(.SD, function(x) data.table::uniqueN(x, na.rm = TRUE)),
+                         .SDcols = cols3,
+                         by = chi]
+    chunk_cols4 <- chunk[,
+                         lapply(.SD, function(x) sum(x, na.rm = TRUE)),
+                         .SDcols = cols4,
+                         by = chi]
+    # chunk_cols5 <- chunk[,
+    #                      lapply(.SD, function(x) max(x, na.rm = TRUE)),
+    #                      .SDcols = cols5,
+    #                      by = chi]
+    chunk_cols6 <- chunk[,
+                         # .SD[1]
+                         lapply(.SD, function(x) x[!is.na(x)][1]),
+                         .SDcols = cols6,
+                         by = chi]
+    chunk_agg <- cbind(chunk_cols1,
+                       chunk_cols2[, chi := NULL],
+                       chunk_cols3[, chi := NULL],
+                       chunk_cols4[, chi := NULL],
+                       # chunk_cols5[, chi := NULL],
+                       chunk_cols6[, chi := NULL])
+
+    # Append the aggregated chunk to the overall result
+    aggregated_data <- data.table::rbindlist(list(aggregated_data, chunk_agg))
+  }
+  aggregated_data <- dplyr::as_tibble(aggregated_data)
+  return(aggregated_data)
+}

From 7c63f5731946a538201fb75378ac1d03c0e60212 Mon Sep 17 00:00:00 2001
From: lizihao-anu <lizihao-anu@users.noreply.github.com>
Date: Wed, 31 May 2023 15:59:26 +0000
Subject: [PATCH 088/200] Style code

---
 R/aggregate_by_chi_zihao.R | 87 ++++++++++++++++++++++----------------
 1 file changed, 50 insertions(+), 37 deletions(-)

diff --git a/R/aggregate_by_chi_zihao.R b/R/aggregate_by_chi_zihao.R
index 85b87e413..fb8a6b48a 100644
--- a/R/aggregate_by_chi_zihao.R
+++ b/R/aggregate_by_chi_zihao.R
@@ -3,18 +3,20 @@ library(data.table)
 aggregate_by_chi_zihao <- function(episode_file) {
   cli::cli_alert_info("Aggregate by CHI function started at {Sys.time()}")
 
-  data.table::setDT(episode_file)  # Convert to data.table
+  data.table::setDT(episode_file) # Convert to data.table
 
   # Sort the data within each chunk
   data.table::setkeyv(episode_file, c("chi", "record_keydate1", "keytime1", "record_keydate2", "keytime2"))
 
   data.table::setnames(
     episode_file,
-    c("ch_chi_cis", "cij_marker", "ooh_case_id"
-      #,"hh_in_fy"
+    c(
+      "ch_chi_cis", "cij_marker", "ooh_case_id"
+      # ,"hh_in_fy"
     ),
-    c("ch_cis_episodes", "cij_total", "ooh_cases"
-      #,"HL1_in_FY"
+    c(
+      "ch_cis_episodes", "cij_total", "ooh_cases"
+      # ,"HL1_in_FY"
     )
   )
 
@@ -22,14 +24,15 @@ aggregate_by_chi_zihao <- function(episode_file) {
   aggregated_data <- data.table::data.table()
 
   # Process the data in chunks
-  chunk_size <- min(nrow(episode_file), 1e7)  # Adjust the chunk size as per your system's memory capacity
+  chunk_size <- min(nrow(episode_file), 1e7) # Adjust the chunk size as per your system's memory capacity
   n_chunks <- nrow(episode_file) %/% chunk_size
 
 
   # colums specification
   cols2 <- names(episode_file)[grepl("postcode$|DoB$|gpprac$",
-                                     names(episode_file),
-                                     ignore.case = TRUE)]
+    names(episode_file),
+    ignore.case = TRUE
+  )]
   cols3 <- c(
     "ch_cis_episodes",
     "cij_total",
@@ -70,13 +73,16 @@ aggregate_by_chi_zihao <- function(episode_file) {
   cols4 <- cols4[!(cols4 %in% c("ch_cis_episodes"))]
   # cols5 <- names(episode_file)[grepl("^sc|HL1_in_FY|NSU", names(episode_file), ignore.case = TRUE)]
   # cols5 <- cols5[!(cols5 %in% c("sc_send_lca", "sc_latest_submission"))]
-  cols6 <- c(condition_cols(),
-             # "death_date",
-             # "deceased",
-             "year",
-             names(episode_file)[grepl("_Cohort$|end_fy$|start_fy$",
-                                       names(episode_file),
-                                       ignore.case = TRUE)])
+  cols6 <- c(
+    condition_cols(),
+    # "death_date",
+    # "deceased",
+    "year",
+    names(episode_file)[grepl("_Cohort$|end_fy$|start_fy$",
+      names(episode_file),
+      ignore.case = TRUE
+    )]
+  )
 
   for (i in 1:n_chunks) {
     start <- (i - 1) * chunk_size + 1
@@ -86,36 +92,43 @@ aggregate_by_chi_zihao <- function(episode_file) {
 
     # compute
     chunk_cols1 <- chunk[,
-                         .(gender = mean(gender)),
-                         by = chi]
+      .(gender = mean(gender)),
+      by = chi
+    ]
     chunk_cols2 <- chunk[,
-                         .SD[.N],
-                         # .SDcols = patterns("postcode$|DoB$|gpprac$"),
-                         .SDcols = cols2,
-                         by = chi]
+      .SD[.N],
+      # .SDcols = patterns("postcode$|DoB$|gpprac$"),
+      .SDcols = cols2,
+      by = chi
+    ]
     chunk_cols3 <- chunk[,
-                         lapply(.SD, function(x) data.table::uniqueN(x, na.rm = TRUE)),
-                         .SDcols = cols3,
-                         by = chi]
+      lapply(.SD, function(x) data.table::uniqueN(x, na.rm = TRUE)),
+      .SDcols = cols3,
+      by = chi
+    ]
     chunk_cols4 <- chunk[,
-                         lapply(.SD, function(x) sum(x, na.rm = TRUE)),
-                         .SDcols = cols4,
-                         by = chi]
+      lapply(.SD, function(x) sum(x, na.rm = TRUE)),
+      .SDcols = cols4,
+      by = chi
+    ]
     # chunk_cols5 <- chunk[,
     #                      lapply(.SD, function(x) max(x, na.rm = TRUE)),
     #                      .SDcols = cols5,
     #                      by = chi]
     chunk_cols6 <- chunk[,
-                         # .SD[1]
-                         lapply(.SD, function(x) x[!is.na(x)][1]),
-                         .SDcols = cols6,
-                         by = chi]
-    chunk_agg <- cbind(chunk_cols1,
-                       chunk_cols2[, chi := NULL],
-                       chunk_cols3[, chi := NULL],
-                       chunk_cols4[, chi := NULL],
-                       # chunk_cols5[, chi := NULL],
-                       chunk_cols6[, chi := NULL])
+      # .SD[1]
+      lapply(.SD, function(x) x[!is.na(x)][1]),
+      .SDcols = cols6,
+      by = chi
+    ]
+    chunk_agg <- cbind(
+      chunk_cols1,
+      chunk_cols2[, chi := NULL],
+      chunk_cols3[, chi := NULL],
+      chunk_cols4[, chi := NULL],
+      # chunk_cols5[, chi := NULL],
+      chunk_cols6[, chi := NULL]
+    )
 
     # Append the aggregated chunk to the overall result
     aggregated_data <- data.table::rbindlist(list(aggregated_data, chunk_agg))

From 70f0891c0bf64f3fbf58f266230d2fd11785f4aa Mon Sep 17 00:00:00 2001
From: Zihao Li <lizihao_anu@outlook.com>
Date: Wed, 31 May 2023 17:34:44 +0100
Subject: [PATCH 089/200] Keep dob column in drop_cols()

---
 R/create_individual_file.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index fe440c723..4596460dd 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -709,7 +709,7 @@ drop_cols <- function(individual_file) {
     dplyr::select(
       -month_cols(),
       -"ch_no_cost",
-      -"dob",
+      # -"dob",
       -"postcode",
       -"gpprac",
       -"no_paid_items" # ,

From abda3d57ddf8da7e44d06c7aad0dfd07cbf69bdf Mon Sep 17 00:00:00 2001
From: James McMahon <james.mcmahon@phs.scot>
Date: Thu, 1 Jun 2023 12:47:47 +0100
Subject: [PATCH 090/200] Use the updated function

---
 R/create_individual_file.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index 4596460dd..05df574e0 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -11,7 +11,7 @@ create_individual_file <- function(episode_file) {
     aggregate_ch_episodes() %>%
     clean_up_ch() %>%
     recode_gender() %>%
-    aggregate_by_chi() %>%
+    aggregate_by_chi_zihao() %>%
     clean_individual_file()
 }
 

From 38be4d2de94bb2e6c1659c788629cb57ae8a9c11 Mon Sep 17 00:00:00 2001
From: Zihao Li <lizihao_anu@outlook.com>
Date: Tue, 6 Jun 2023 11:22:25 +0100
Subject: [PATCH 091/200] Import data.table symbols via roxygen instead of library()

---
 NAMESPACE                     |  3 +++
 R/aggregate_by_chi_zihao.R    | 14 +++++++++++---
 man/aggregate_by_chi_zihao.Rd | 15 +++++++++++++++
 3 files changed, 29 insertions(+), 3 deletions(-)
 create mode 100644 man/aggregate_by_chi_zihao.Rd

diff --git a/NAMESPACE b/NAMESPACE
index 5610cf6f2..28cccd6aa 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -158,6 +158,9 @@ export(start_fy)
 export(start_fy_quarter)
 export(start_next_fy_quarter)
 export(write_file)
+importFrom(data.table,":=")
+importFrom(data.table,.N)
+importFrom(data.table,.SD)
 importFrom(magrittr,"%>%")
 importFrom(readr,col_character)
 importFrom(readr,col_date)
diff --git a/R/aggregate_by_chi_zihao.R b/R/aggregate_by_chi_zihao.R
index fb8a6b48a..069aa53a4 100644
--- a/R/aggregate_by_chi_zihao.R
+++ b/R/aggregate_by_chi_zihao.R
@@ -1,5 +1,13 @@
-library(data.table)
-
+#' Aggregate by CHI
+#'
+#' @description Aggregate episode file by CHI to convert into
+#' individual file.
+#'
+#' @importFrom data.table :=
+#' @importFrom data.table .N
+#' @importFrom data.table .SD
+#'
+#' @inheritParams create_individual_file
 aggregate_by_chi_zihao <- function(episode_file) {
   cli::cli_alert_info("Aggregate by CHI function started at {Sys.time()}")
 
@@ -24,7 +32,7 @@ aggregate_by_chi_zihao <- function(episode_file) {
   aggregated_data <- data.table::data.table()
 
   # Process the data in chunks
-  chunk_size <- min(nrow(episode_file), 1e7) # Adjust the chunk size as per your system's memory capacity
+  chunk_size <- min(nrow(episode_file), 5e7) # Adjust the chunk size as per your system's memory capacity
   n_chunks <- nrow(episode_file) %/% chunk_size
 
 
diff --git a/man/aggregate_by_chi_zihao.Rd b/man/aggregate_by_chi_zihao.Rd
new file mode 100644
index 000000000..3d4961e19
--- /dev/null
+++ b/man/aggregate_by_chi_zihao.Rd
@@ -0,0 +1,15 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/aggregate_by_chi_zihao.R
+\name{aggregate_by_chi_zihao}
+\alias{aggregate_by_chi_zihao}
+\title{Aggregate by CHI}
+\usage{
+aggregate_by_chi_zihao(episode_file)
+}
+\arguments{
+\item{episode_file}{Tibble containing episodic data}
+}
+\description{
+Aggregate episode file by CHI to convert into
+individual file.
+}

From 6368535136ce2133820fb48e3d0760a9db8344fd Mon Sep 17 00:00:00 2001
From: Zihao Li <lizihao_anu@outlook.com>
Date: Tue, 6 Jun 2023 12:23:25 +0100
Subject: [PATCH 092/200] remove redundant columns dob postcode and gpprac

---
 NAMESPACE                  | 1 -
 R/aggregate_by_chi_zihao.R | 6 +++++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/NAMESPACE b/NAMESPACE
index 28cccd6aa..395814633 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -158,7 +158,6 @@ export(start_fy)
 export(start_fy_quarter)
 export(start_next_fy_quarter)
 export(write_file)
-importFrom(data.table,":=")
 importFrom(data.table,.N)
 importFrom(data.table,.SD)
 importFrom(magrittr,"%>%")
diff --git a/R/aggregate_by_chi_zihao.R b/R/aggregate_by_chi_zihao.R
index 069aa53a4..b461484cb 100644
--- a/R/aggregate_by_chi_zihao.R
+++ b/R/aggregate_by_chi_zihao.R
@@ -3,7 +3,6 @@
 #' @description Aggregate episode file by CHI to convert into
 #' individual file.
 #'
-#' @importFrom data.table :=
 #' @importFrom data.table .N
 #' @importFrom data.table .SD
 #'
@@ -11,6 +10,11 @@
 aggregate_by_chi_zihao <- function(episode_file) {
   cli::cli_alert_info("Aggregate by CHI function started at {Sys.time()}")
 
+  episode_file <- episode_file %>%
+    dplyr::select(-dplyr::ends_with("_gpprac") | "most_recent_gpprac") %>%
+    dplyr::select(-dplyr::ends_with("_postcode") | "most_recent_postcode") %>%
+    dplyr::select(-dplyr::ends_with("_DoB") | "dob")
+
   data.table::setDT(episode_file) # Convert to data.table
 
   # Sort the data within each chunk

From 2d7001962d7001abe38b220078ef6dde1e7ecde4 Mon Sep 17 00:00:00 2001
From: Zihao Li <lizihao_anu@outlook.com>
Date: Tue, 6 Jun 2023 16:07:48 +0100
Subject: [PATCH 093/200] minor changes to remove redundant postcode gpprac
 columns

---
 R/aggregate_by_chi_zihao.R | 25 ++++++++++++++++++++-----
 1 file changed, 20 insertions(+), 5 deletions(-)

diff --git a/R/aggregate_by_chi_zihao.R b/R/aggregate_by_chi_zihao.R
index b461484cb..656f25885 100644
--- a/R/aggregate_by_chi_zihao.R
+++ b/R/aggregate_by_chi_zihao.R
@@ -11,14 +11,28 @@ aggregate_by_chi_zihao <- function(episode_file) {
   cli::cli_alert_info("Aggregate by CHI function started at {Sys.time()}")
 
   episode_file <- episode_file %>%
-    dplyr::select(-dplyr::ends_with("_gpprac") | "most_recent_gpprac") %>%
-    dplyr::select(-dplyr::ends_with("_postcode") | "most_recent_postcode") %>%
-    dplyr::select(-dplyr::ends_with("_DoB") | "dob")
+    dplyr::select(-c(postcode, gpprac)) %>%
+    dplyr::rename("gpprac" = "most_recent_gpprac",
+                  "postcode" = "most_recent_postcode") %>%
+    dplyr::select(-c(
+      dplyr::ends_with("_gpprac"),
+      dplyr::ends_with("_postcode"),
+      dplyr::ends_with("_DoB")
+    ))
 
   data.table::setDT(episode_file) # Convert to data.table
 
   # Sort the data within each chunk
-  data.table::setkeyv(episode_file, c("chi", "record_keydate1", "keytime1", "record_keydate2", "keytime2"))
+  data.table::setkeyv(
+    episode_file,
+    c(
+      "chi",
+      "record_keydate1",
+      "keytime1",
+      "record_keydate2",
+      "keytime2"
+    )
+  )
 
   data.table::setnames(
     episode_file,
@@ -36,7 +50,8 @@ aggregate_by_chi_zihao <- function(episode_file) {
   aggregated_data <- data.table::data.table()
 
   # Process the data in chunks
-  chunk_size <- min(nrow(episode_file), 5e7) # Adjust the chunk size as per your system's memory capacity
+  chunk_size <- min(nrow(episode_file), 5e7)
+  # Adjust the chunk size as per your system's memory capacity
   n_chunks <- nrow(episode_file) %/% chunk_size
 
 

From 9f23cff13b75b1201310ffd7ee85a32479188b2b Mon Sep 17 00:00:00 2001
From: lizihao-anu <lizihao-anu@users.noreply.github.com>
Date: Tue, 6 Jun 2023 15:10:03 +0000
Subject: [PATCH 094/200] Style code

---
 R/aggregate_by_chi_zihao.R | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/R/aggregate_by_chi_zihao.R b/R/aggregate_by_chi_zihao.R
index 656f25885..17641d1fb 100644
--- a/R/aggregate_by_chi_zihao.R
+++ b/R/aggregate_by_chi_zihao.R
@@ -12,8 +12,10 @@ aggregate_by_chi_zihao <- function(episode_file) {
 
   episode_file <- episode_file %>%
     dplyr::select(-c(postcode, gpprac)) %>%
-    dplyr::rename("gpprac" = "most_recent_gpprac",
-                  "postcode" = "most_recent_postcode") %>%
+    dplyr::rename(
+      "gpprac" = "most_recent_gpprac",
+      "postcode" = "most_recent_postcode"
+    ) %>%
     dplyr::select(-c(
       dplyr::ends_with("_gpprac"),
       dplyr::ends_with("_postcode"),

From b361616cdfa32683a8fd1421c0b6a9b58329b261 Mon Sep 17 00:00:00 2001
From: Zihao Li <lizihao_anu@outlook.com>
Date: Tue, 6 Jun 2023 16:11:33 +0100
Subject: [PATCH 095/200] Convert aggregated column names to lowercase

---
 R/aggregate_by_chi_zihao.R | 1 +
 1 file changed, 1 insertion(+)

diff --git a/R/aggregate_by_chi_zihao.R b/R/aggregate_by_chi_zihao.R
index 656f25885..b5f4fd549 100644
--- a/R/aggregate_by_chi_zihao.R
+++ b/R/aggregate_by_chi_zihao.R
@@ -161,5 +161,6 @@ aggregate_by_chi_zihao <- function(episode_file) {
     aggregated_data <- data.table::rbindlist(list(aggregated_data, chunk_agg))
   }
   aggregated_data <- dplyr::as_tibble(aggregated_data)
+  names(aggregated_data) = tolower(names(aggregated_data))
   return(aggregated_data)
 }

From 550adab299b13a28901192ac99dc72becbabceb6 Mon Sep 17 00:00:00 2001
From: lizihao-anu <lizihao-anu@users.noreply.github.com>
Date: Tue, 6 Jun 2023 15:17:09 +0000
Subject: [PATCH 096/200] Style code

---
 R/aggregate_by_chi_zihao.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/aggregate_by_chi_zihao.R b/R/aggregate_by_chi_zihao.R
index a6e115260..8cd443a9b 100644
--- a/R/aggregate_by_chi_zihao.R
+++ b/R/aggregate_by_chi_zihao.R
@@ -163,6 +163,6 @@ aggregate_by_chi_zihao <- function(episode_file) {
     aggregated_data <- data.table::rbindlist(list(aggregated_data, chunk_agg))
   }
   aggregated_data <- dplyr::as_tibble(aggregated_data)
-  names(aggregated_data) = tolower(names(aggregated_data))
+  names(aggregated_data) <- tolower(names(aggregated_data))
   return(aggregated_data)
 }

From fee7d8bb0b1db8e4b62e27dc6a5fd9b6273a258d Mon Sep 17 00:00:00 2001
From: Zihao Li <lizihao_anu@outlook.com>
Date: Wed, 7 Jun 2023 11:30:34 +0100
Subject: [PATCH 097/200] Add data.table-based aggregate_ch_episodes_zihao()

---
 R/aggregate_by_chi_zihao.R | 27 +++++++++++++++++++++++++++
 R/create_individual_file.R |  6 +++---
 2 files changed, 30 insertions(+), 3 deletions(-)

diff --git a/R/aggregate_by_chi_zihao.R b/R/aggregate_by_chi_zihao.R
index 8cd443a9b..4143b28c1 100644
--- a/R/aggregate_by_chi_zihao.R
+++ b/R/aggregate_by_chi_zihao.R
@@ -166,3 +166,30 @@ aggregate_by_chi_zihao <- function(episode_file) {
   names(aggregated_data) <- tolower(names(aggregated_data))
   return(aggregated_data)
 }
+
+
+#' Aggregate CIS episodes
+#'
+#' @description Aggregate CH variables by CHI and CIS.
+#'
+#'
+#' @inheritParams create_individual_file
+aggregate_ch_episodes_zihao <- function(episode_file) {
+  cli::cli_alert_info("Aggregate ch episodes function started at {Sys.time()}")
+
+  # Convert to data.table
+  data.table::setDT(episode_file)
+
+  # Perform grouping and aggregation
+  episode_file <- episode_file[, `:=`(
+    ch_no_cost = max(ch_no_cost),
+    ch_ep_start = min(record_keydate1),
+    ch_ep_end = max(ch_ep_end),
+    ch_cost_per_day = mean(ch_cost_per_day)
+  ), by = .(chi, ch_chi_cis)]
+
+  # Convert back to tibble if needed
+  episode_file <- tibble::as_tibble(episode_file)
+
+  return(episode_file)
+}
diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index 05df574e0..04a8be917 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -8,7 +8,7 @@ create_individual_file <- function(episode_file) {
     remove_blank_chi() %>%
     add_cij_columns() %>%
     add_all_columns() %>%
-    aggregate_ch_episodes() %>%
+    aggregate_ch_episodes_zihao() %>%
     clean_up_ch() %>%
     recode_gender() %>%
     aggregate_by_chi_zihao() %>%
@@ -710,8 +710,8 @@ drop_cols <- function(individual_file) {
       -month_cols(),
       -"ch_no_cost",
       # -"dob",
-      -"postcode",
-      -"gpprac",
+      # -"postcode",
+      # -"gpprac",
       -"no_paid_items" # ,
       #-"total_no_dn_contacts"
     )

From a8f4ae27acede3e6e1ed1a509333d0e9e74ff3ad Mon Sep 17 00:00:00 2001
From: lizihao-anu <lizihao-anu@users.noreply.github.com>
Date: Wed, 7 Jun 2023 10:36:21 +0000
Subject: [PATCH 098/200] Update documentation

---
 man/aggregate_ch_episodes_zihao.Rd | 14 ++++++++++++++
 1 file changed, 14 insertions(+)
 create mode 100644 man/aggregate_ch_episodes_zihao.Rd

diff --git a/man/aggregate_ch_episodes_zihao.Rd b/man/aggregate_ch_episodes_zihao.Rd
new file mode 100644
index 000000000..808262654
--- /dev/null
+++ b/man/aggregate_ch_episodes_zihao.Rd
@@ -0,0 +1,14 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/aggregate_by_chi_zihao.R
+\name{aggregate_ch_episodes_zihao}
+\alias{aggregate_ch_episodes_zihao}
+\title{Aggregate CIS episodes}
+\usage{
+aggregate_ch_episodes_zihao(episode_file)
+}
+\arguments{
+\item{episode_file}{Tibble containing episodic data}
+}
+\description{
+Aggregate CH variables by CHI and CIS.
+}

From cd8a08bbd0e242d8e2dc35a38b39f6496aa6a922 Mon Sep 17 00:00:00 2001
From: Zihao Li <lizihao_anu@outlook.com>
Date: Tue, 13 Jun 2023 11:43:01 +0100
Subject: [PATCH 099/200] add functions to replace regular expressions to
 select column/variables

---
 R/aggregate_by_chi_zihao.R | 271 ++++++++++++++++++++-----------------
 1 file changed, 144 insertions(+), 127 deletions(-)

diff --git a/R/aggregate_by_chi_zihao.R b/R/aggregate_by_chi_zihao.R
index 4143b28c1..280f1aa0c 100644
--- a/R/aggregate_by_chi_zihao.R
+++ b/R/aggregate_by_chi_zihao.R
@@ -12,16 +12,16 @@ aggregate_by_chi_zihao <- function(episode_file) {
 
   episode_file <- episode_file %>%
     dplyr::select(-c(postcode, gpprac)) %>%
-    dplyr::rename(
-      "gpprac" = "most_recent_gpprac",
-      "postcode" = "most_recent_postcode"
-    ) %>%
+    dplyr::rename("gpprac" = "most_recent_gpprac",
+                  "postcode" = "most_recent_postcode") %>%
     dplyr::select(-c(
       dplyr::ends_with("_gpprac"),
       dplyr::ends_with("_postcode"),
       dplyr::ends_with("_DoB")
     ))
 
+  names(episode_file) <- tolower(names(episode_file))
+
   data.table::setDT(episode_file) # Convert to data.table
 
   # Sort the data within each chunk
@@ -38,135 +38,152 @@ aggregate_by_chi_zihao <- function(episode_file) {
 
   data.table::setnames(
     episode_file,
-    c(
-      "ch_chi_cis", "cij_marker", "ooh_case_id"
+    c("ch_chi_cis", "cij_marker", "ooh_case_id"
       # ,"hh_in_fy"
-    ),
-    c(
-      "ch_cis_episodes", "cij_total", "ooh_cases"
-      # ,"HL1_in_FY"
-    )
-  )
+      ),
+      c("ch_cis_episodes", "cij_total", "ooh_cases"
+        # ,"hl1_in_fy"
+        )
+      )
+
+      # Initialize an empty data.table for the aggregated results
+      aggregated_data <- data.table::data.table()
+
+      # Process the data in chunks
+      chunk_size <- min(nrow(episode_file), 5e7)
+      # Adjust the chunk size as per your system's memory capacity
+      n_chunks <- nrow(episode_file) %/% chunk_size
+
+
+      # colums specification
+      # columns to select last
+      cols2 <- vars_end_with(episode_file,
+                             c("postcode", "dob", "ggprac"))
+      # columns to select last unique rows
+      cols3 <- c(
+        "ch_cis_episodes",
+        "cij_total",
+        "CIJ_el",
+        "CIJ_non_el",
+        "CIJ_mat",
+        # "cij_delay",
+        "ooh_cases",
+        "preventable_admissions"
+      )
+      # columns to sum up
+      cols4 <- c(vars_end_with(
+        episode_file,
+        c(
+          "episodes",
+          "beddays",
+          "cost",
+          "attendances",
+          "attend",
+          # "contacts",
+          "hours",
+          "alarms",
+          "telecare",
+          "paid_items",
+          "advice",
+          "homev",
+          "time",
+          "assessment",
+          "other",
+          # "dn",
+          "nhs24",
+          "pcc",
+          "_dnas"
+        )
+      ),
+      vars_start_with(episode_file,
+                      "sds_option"))
+      cols4 <- cols4[!(cols4 %in% c("ch_cis_episodes"))]
+      # # columns to select maximum
+      # cols5 <- vars_contain(episode_file, "nsu")
+      # columns to select first row
+      cols6 <- c(condition_cols(),
+                 # "death_date",
+                 # "deceased",
+                 "year",
+                 vars_end_with(episode_file,
+                               c("_cohort", "end_fy", "start_fy")))
+
+      for (i in 1:n_chunks) {
+        start <- (i - 1) * chunk_size + 1
+        end <- i * chunk_size
+        # Subset the data to the current chunk
+        chunk <- episode_file[start:end]
+
+        # compute
+        chunk_cols1 <- chunk[,
+                             .(gender = mean(gender)),
+                             by = chi]
+        chunk_cols2 <- chunk[,
+                             .SD[.N],
+                             .SDcols = cols2,
+                             by = chi]
+        chunk_cols3 <- chunk[,
+                             lapply(.SD, function(x)
+                               data.table::uniqueN(x, na.rm = TRUE)),
+                             .SDcols = cols3,
+                             by = chi]
+        chunk_cols4 <- chunk[,
+                             lapply(.SD, function(x)
+                               sum(x, na.rm = TRUE)),
+                             .SDcols = cols4,
+                             by = chi]
+        # chunk_cols5 <- chunk[,
+        #                      lapply(.SD, function(x) max(x, na.rm = TRUE)),
+        #                      .SDcols = cols5,
+        #                      by = chi]
+        chunk_cols6 <- chunk[,
+                             lapply(.SD, function(x)
+                               x[!is.na(x)][1]),
+                             .SDcols = cols6,
+                             by = chi]
+        chunk_agg <- dplyr::bind_cols(chunk_cols1,
+                                      chunk_cols2[, chi := NULL],
+                                      chunk_cols3[, chi := NULL],
+                                      chunk_cols4[, chi := NULL],
+                                      # chunk_cols5[, chi := NULL],
+                                      chunk_cols6[, chi := NULL])
+
+        # Append the aggregated chunk to the overall result
+        aggregated_data <-
+          data.table::rbindlist(list(aggregated_data, chunk_agg))
+      }
+      aggregated_data <- dplyr::as_tibble(aggregated_data)
+
+      return(aggregated_data)
+}
 
-  # Initialize an empty data.table for the aggregated results
-  aggregated_data <- data.table::data.table()
-
-  # Process the data in chunks
-  chunk_size <- min(nrow(episode_file), 5e7)
-  # Adjust the chunk size as per your system's memory capacity
-  n_chunks <- nrow(episode_file) %/% chunk_size
-
-
-  # colums specification
-  cols2 <- names(episode_file)[grepl("postcode$|DoB$|gpprac$",
-    names(episode_file),
-    ignore.case = TRUE
-  )]
-  cols3 <- c(
-    "ch_cis_episodes",
-    "cij_total",
-    "CIJ_el",
-    "CIJ_non_el",
-    "CIJ_mat",
-    # "cij_delay",
-    "ooh_cases",
-    "preventable_admissions"
-  )
-  cols4 <- names(episode_file)[grepl(
-    paste(
-      "episodes$",
-      "beddays$",
-      "cost$",
-      "attendances$",
-      "attend$",
-      # "contacts$",
-      "hours$",
-      "alarms$",
-      "telecare$",
-      "paid_items$",
-      "advice$",
-      "homeV$",
-      "time$",
-      "assessment$",
-      "other$",
-      # "DN$",
-      "NHS24$",
-      "PCC$",
-      "_dnas$",
-      "^SDS_option",
-      sep = "|"
-    ),
-    names(episode_file),
-    ignore.case = TRUE
-  )]
-  cols4 <- cols4[!(cols4 %in% c("ch_cis_episodes"))]
-  # cols5 <- names(episode_file)[grepl("^sc|HL1_in_FY|NSU", names(episode_file), ignore.case = TRUE)]
-  # cols5 <- cols5[!(cols5 %in% c("sc_send_lca", "sc_latest_submission"))]
-  cols6 <- c(
-    condition_cols(),
-    # "death_date",
-    # "deceased",
-    "year",
-    names(episode_file)[grepl("_Cohort$|end_fy$|start_fy$",
-      names(episode_file),
-      ignore.case = TRUE
-    )]
-  )
 
-  for (i in 1:n_chunks) {
-    start <- (i - 1) * chunk_size + 1
-    end <- i * chunk_size
-    # Subset the data to the current chunk
-    chunk <- episode_file[start:end]
-
-    # compute
-    chunk_cols1 <- chunk[,
-      .(gender = mean(gender)),
-      by = chi
-    ]
-    chunk_cols2 <- chunk[,
-      .SD[.N],
-      # .SDcols = patterns("postcode$|DoB$|gpprac$"),
-      .SDcols = cols2,
-      by = chi
-    ]
-    chunk_cols3 <- chunk[,
-      lapply(.SD, function(x) data.table::uniqueN(x, na.rm = TRUE)),
-      .SDcols = cols3,
-      by = chi
-    ]
-    chunk_cols4 <- chunk[,
-      lapply(.SD, function(x) sum(x, na.rm = TRUE)),
-      .SDcols = cols4,
-      by = chi
-    ]
-    # chunk_cols5 <- chunk[,
-    #                      lapply(.SD, function(x) max(x, na.rm = TRUE)),
-    #                      .SDcols = cols5,
-    #                      by = chi]
-    chunk_cols6 <- chunk[,
-      # .SD[1]
-      lapply(.SD, function(x) x[!is.na(x)][1]),
-      .SDcols = cols6,
-      by = chi
-    ]
-    chunk_agg <- cbind(
-      chunk_cols1,
-      chunk_cols2[, chi := NULL],
-      chunk_cols3[, chi := NULL],
-      chunk_cols4[, chi := NULL],
-      # chunk_cols5[, chi := NULL],
-      chunk_cols6[, chi := NULL]
-    )
+#' select columns ending with some patterns
+#' @describeIn select columns based on patterns
+#'
+vars_end_with <- function(data, vars, ignore_case = FALSE) {
+  names(data)[stringr::str_ends(names(data),
+                                stringr::regex(paste(vars, collapse = "|"),
+                                               ignore_case = ignore_case))]
+}
 
-    # Append the aggregated chunk to the overall result
-    aggregated_data <- data.table::rbindlist(list(aggregated_data, chunk_agg))
-  }
-  aggregated_data <- dplyr::as_tibble(aggregated_data)
-  names(aggregated_data) <- tolower(names(aggregated_data))
-  return(aggregated_data)
+#' select columns starting with some patterns
+#' @describeIn select columns based on patterns
+#'
+vars_start_with <- function(data, vars, ignore_case = FALSE) {
+  names(data)[stringr::str_starts(names(data),
+                                  stringr::regex(paste(vars, collapse = "|"),
+                                                 ignore_case = ignore_case))]
 }
 
+#' select columns contains some characters
+#' @describeIn select columns based on patterns
+#'
+vars_contain <- function(data, vars, ignore_case = FALSE) {
+  names(data)[stringr::str_detect(names(data),
+                                  stringr::regex(paste(vars, collapse = "|"),
+                                                 ignore_case = ignore_case))]
+}
 
 #' Aggregate CIS episodes
 #'

From e03b02df7b7568061414910a8d5842697b967027 Mon Sep 17 00:00:00 2001
From: lizihao-anu <lizihao-anu@users.noreply.github.com>
Date: Tue, 13 Jun 2023 11:03:25 +0000
Subject: [PATCH 100/200] Update documentation

---
 man/select.Rd | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)
 create mode 100644 man/select.Rd

diff --git a/man/select.Rd b/man/select.Rd
new file mode 100644
index 000000000..435096d9a
--- /dev/null
+++ b/man/select.Rd
@@ -0,0 +1,30 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/aggregate_by_chi_zihao.R
+\name{vars_end_with}
+\alias{vars_end_with}
+\alias{vars_start_with}
+\alias{vars_contain}
+\title{select columns ending with some patterns}
+\usage{
+vars_end_with(data, vars, ignore_case = FALSE)
+
+vars_start_with(data, vars, ignore_case = FALSE)
+
+vars_contain(data, vars, ignore_case = FALSE)
+}
+\description{
+select columns ending with some patterns
+
+select columns starting with some patterns
+
+select columns contains some characters
+}
+\section{Functions}{
+\itemize{
+\item \code{vars_end_with()}: columns based on patterns
+
+\item \code{vars_start_with()}: columns based on patterns
+
+\item \code{vars_contain()}: columns based on patterns
+
+}}

From 66e21e61409bf2138f6766c68a924e3061f56827 Mon Sep 17 00:00:00 2001
From: lizihao-anu <lizihao-anu@users.noreply.github.com>
Date: Tue, 13 Jun 2023 11:06:39 +0000
Subject: [PATCH 101/200] Style code

---
 R/aggregate_by_chi_zihao.R | 283 +++++++++++++++++++++----------------
 1 file changed, 158 insertions(+), 125 deletions(-)

diff --git a/R/aggregate_by_chi_zihao.R b/R/aggregate_by_chi_zihao.R
index 280f1aa0c..63aaa9cbb 100644
--- a/R/aggregate_by_chi_zihao.R
+++ b/R/aggregate_by_chi_zihao.R
@@ -12,8 +12,10 @@ aggregate_by_chi_zihao <- function(episode_file) {
 
   episode_file <- episode_file %>%
     dplyr::select(-c(postcode, gpprac)) %>%
-    dplyr::rename("gpprac" = "most_recent_gpprac",
-                  "postcode" = "most_recent_postcode") %>%
+    dplyr::rename(
+      "gpprac" = "most_recent_gpprac",
+      "postcode" = "most_recent_postcode"
+    ) %>%
     dplyr::select(-c(
       dplyr::ends_with("_gpprac"),
       dplyr::ends_with("_postcode"),
@@ -38,123 +40,145 @@ aggregate_by_chi_zihao <- function(episode_file) {
 
   data.table::setnames(
     episode_file,
-    c("ch_chi_cis", "cij_marker", "ooh_case_id"
+    c(
+      "ch_chi_cis", "cij_marker", "ooh_case_id"
       # ,"hh_in_fy"
-      ),
-      c("ch_cis_episodes", "cij_total", "ooh_cases"
-        # ,"hl1_in_fy"
-        )
-      )
+    ),
+    c(
+      "ch_cis_episodes", "cij_total", "ooh_cases"
+      # ,"hl1_in_fy"
+    )
+  )
+
+  # Initialize an empty data.table for the aggregated results
+  aggregated_data <- data.table::data.table()
 
-      # Initialize an empty data.table for the aggregated results
-      aggregated_data <- data.table::data.table()
-
-      # Process the data in chunks
-      chunk_size <- min(nrow(episode_file), 5e7)
-      # Adjust the chunk size as per your system's memory capacity
-      n_chunks <- nrow(episode_file) %/% chunk_size
-
-
-      # colums specification
-      # columns to select last
-      cols2 <- vars_end_with(episode_file,
-                             c("postcode", "dob", "ggprac"))
-      # columns to select last unique rows
-      cols3 <- c(
-        "ch_cis_episodes",
-        "cij_total",
-        "CIJ_el",
-        "CIJ_non_el",
-        "CIJ_mat",
-        # "cij_delay",
-        "ooh_cases",
-        "preventable_admissions"
+  # Process the data in chunks
+  chunk_size <- min(nrow(episode_file), 5e7)
+  # Adjust the chunk size as per your system's memory capacity
+  n_chunks <- nrow(episode_file) %/% chunk_size
+
+
+  # colums specification
+  # columns to select last
+  cols2 <- vars_end_with(
+    episode_file,
+    c("postcode", "dob", "ggprac")
+  )
+  # columns to select last unique rows
+  cols3 <- c(
+    "ch_cis_episodes",
+    "cij_total",
+    "CIJ_el",
+    "CIJ_non_el",
+    "CIJ_mat",
+    # "cij_delay",
+    "ooh_cases",
+    "preventable_admissions"
+  )
+  # columns to sum up
+  cols4 <- c(
+    vars_end_with(
+      episode_file,
+      c(
+        "episodes",
+        "beddays",
+        "cost",
+        "attendances",
+        "attend",
+        # "contacts",
+        "hours",
+        "alarms",
+        "telecare",
+        "paid_items",
+        "advice",
+        "homev",
+        "time",
+        "assessment",
+        "other",
+        # "dn",
+        "nhs24",
+        "pcc",
+        "_dnas"
       )
-      # columns to sum up
-      cols4 <- c(vars_end_with(
-        episode_file,
-        c(
-          "episodes",
-          "beddays",
-          "cost",
-          "attendances",
-          "attend",
-          # "contacts",
-          "hours",
-          "alarms",
-          "telecare",
-          "paid_items",
-          "advice",
-          "homev",
-          "time",
-          "assessment",
-          "other",
-          # "dn",
-          "nhs24",
-          "pcc",
-          "_dnas"
-        )
-      ),
-      vars_start_with(episode_file,
-                      "sds_option"))
-      cols4 <- cols4[!(cols4 %in% c("ch_cis_episodes"))]
-      # # columns to select maximum
-      # cols5 <- vars_contain(episode_file, "nsu")
-      # columns to select first row
-      cols6 <- c(condition_cols(),
-                 # "death_date",
-                 # "deceased",
-                 "year",
-                 vars_end_with(episode_file,
-                               c("_cohort", "end_fy", "start_fy")))
-
-      for (i in 1:n_chunks) {
-        start <- (i - 1) * chunk_size + 1
-        end <- i * chunk_size
-        # Subset the data to the current chunk
-        chunk <- episode_file[start:end]
-
-        # compute
-        chunk_cols1 <- chunk[,
-                             .(gender = mean(gender)),
-                             by = chi]
-        chunk_cols2 <- chunk[,
-                             .SD[.N],
-                             .SDcols = cols2,
-                             by = chi]
-        chunk_cols3 <- chunk[,
-                             lapply(.SD, function(x)
-                               data.table::uniqueN(x, na.rm = TRUE)),
-                             .SDcols = cols3,
-                             by = chi]
-        chunk_cols4 <- chunk[,
-                             lapply(.SD, function(x)
-                               sum(x, na.rm = TRUE)),
-                             .SDcols = cols4,
-                             by = chi]
-        # chunk_cols5 <- chunk[,
-        #                      lapply(.SD, function(x) max(x, na.rm = TRUE)),
-        #                      .SDcols = cols5,
-        #                      by = chi]
-        chunk_cols6 <- chunk[,
-                             lapply(.SD, function(x)
-                               x[!is.na(x)][1]),
-                             .SDcols = cols6,
-                             by = chi]
-        chunk_agg <- dplyr::bind_cols(chunk_cols1,
-                                      chunk_cols2[, chi := NULL],
-                                      chunk_cols3[, chi := NULL],
-                                      chunk_cols4[, chi := NULL],
-                                      # chunk_cols5[, chi := NULL],
-                                      chunk_cols6[, chi := NULL])
-
-        # Append the aggregated chunk to the overall result
-        aggregated_data <-
-          data.table::rbindlist(list(aggregated_data, chunk_agg))
-      }
-      aggregated_data <- dplyr::as_tibble(aggregated_data)
-
-      return(aggregated_data)
+    ),
+    vars_start_with(
+      episode_file,
+      "sds_option"
+    )
+  )
+  cols4 <- cols4[!(cols4 %in% c("ch_cis_episodes"))]
+  # # columns to select maximum
+  # cols5 <- vars_contain(episode_file, "nsu")
+  # columns to select first row
+  cols6 <- c(
+    condition_cols(),
+    # "death_date",
+    # "deceased",
+    "year",
+    vars_end_with(
+      episode_file,
+      c("_cohort", "end_fy", "start_fy")
+    )
+  )
+
+  for (i in 1:n_chunks) {
+    start <- (i - 1) * chunk_size + 1
+    end <- i * chunk_size
+    # Subset the data to the current chunk
+    chunk <- episode_file[start:end]
+
+    # compute
+    chunk_cols1 <- chunk[,
+      .(gender = mean(gender)),
+      by = chi
+    ]
+    chunk_cols2 <- chunk[,
+      .SD[.N],
+      .SDcols = cols2,
+      by = chi
+    ]
+    chunk_cols3 <- chunk[,
+      lapply(.SD, function(x) {
+        data.table::uniqueN(x, na.rm = TRUE)
+      }),
+      .SDcols = cols3,
+      by = chi
+    ]
+    chunk_cols4 <- chunk[,
+      lapply(.SD, function(x) {
+        sum(x, na.rm = TRUE)
+      }),
+      .SDcols = cols4,
+      by = chi
+    ]
+    # chunk_cols5 <- chunk[,
+    #                      lapply(.SD, function(x) max(x, na.rm = TRUE)),
+    #                      .SDcols = cols5,
+    #                      by = chi]
+    chunk_cols6 <- chunk[,
+      lapply(.SD, function(x) {
+        x[!is.na(x)][1]
+      }),
+      .SDcols = cols6,
+      by = chi
+    ]
+    chunk_agg <- dplyr::bind_cols(
+      chunk_cols1,
+      chunk_cols2[, chi := NULL],
+      chunk_cols3[, chi := NULL],
+      chunk_cols4[, chi := NULL],
+      # chunk_cols5[, chi := NULL],
+      chunk_cols6[, chi := NULL]
+    )
+
+    # Append the aggregated chunk to the overall result
+    aggregated_data <-
+      data.table::rbindlist(list(aggregated_data, chunk_agg))
+  }
+  aggregated_data <- dplyr::as_tibble(aggregated_data)
+
+  return(aggregated_data)
 }
 
 
@@ -162,27 +186,36 @@ aggregate_by_chi_zihao <- function(episode_file) {
 #' @describeIn select columns based on patterns
 #'
 vars_end_with <- function(data, vars, ignore_case = FALSE) {
-  names(data)[stringr::str_ends(names(data),
-                                stringr::regex(paste(vars, collapse = "|"),
-                                               ignore_case = ignore_case))]
+  names(data)[stringr::str_ends(
+    names(data),
+    stringr::regex(paste(vars, collapse = "|"),
+      ignore_case = ignore_case
+    )
+  )]
 }
 
 #' select columns starting with some patterns
 #' @describeIn select columns based on patterns
 #'
 vars_start_with <- function(data, vars, ignore_case = FALSE) {
-  names(data)[stringr::str_starts(names(data),
-                                  stringr::regex(paste(vars, collapse = "|"),
-                                                 ignore_case = ignore_case))]
+  names(data)[stringr::str_starts(
+    names(data),
+    stringr::regex(paste(vars, collapse = "|"),
+      ignore_case = ignore_case
+    )
+  )]
 }
 
 #' select columns contains some characters
 #' @describeIn select columns based on patterns
 #'
 vars_contain <- function(data, vars, ignore_case = FALSE) {
-  names(data)[stringr::str_detect(names(data),
-                                  stringr::regex(paste(vars, collapse = "|"),
-                                                 ignore_case = ignore_case))]
+  names(data)[stringr::str_detect(
+    names(data),
+    stringr::regex(paste(vars, collapse = "|"),
+      ignore_case = ignore_case
+    )
+  )]
 }
 
 #' Aggregate CIS episodes

From f0fce5b7e065573e86e07d24720e2b7ba8be03b4 Mon Sep 17 00:00:00 2001
From: Zihao Li <lizihao_anu@outlook.com>
Date: Tue, 13 Jun 2023 15:03:39 +0100
Subject: [PATCH 102/200] minor changes

---
 R/aggregate_by_chi_zihao.R | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/R/aggregate_by_chi_zihao.R b/R/aggregate_by_chi_zihao.R
index 63aaa9cbb..c7073d6ad 100644
--- a/R/aggregate_by_chi_zihao.R
+++ b/R/aggregate_by_chi_zihao.R
@@ -69,12 +69,13 @@ aggregate_by_chi_zihao <- function(episode_file) {
   cols3 <- c(
     "ch_cis_episodes",
     "cij_total",
-    "CIJ_el",
-    "CIJ_non_el",
-    "CIJ_mat",
+    "cij_el",
+    "cij_non_el",
+    "cij_mat",
     # "cij_delay",
     "ooh_cases",
-    "preventable_admissions"
+    "preventable_admissions",
+    "gpprac"
   )
   # columns to sum up
   cols4 <- c(

From f1b96d114544857276dcb6b0ac1eb42b14667198 Mon Sep 17 00:00:00 2001
From: Zihao Li <lizihao_anu@outlook.com>
Date: Wed, 14 Jun 2023 15:19:16 +0100
Subject: [PATCH 103/200] add a missing variable, cij_delay

---
 R/link_delayed_discharge_eps.R | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/R/link_delayed_discharge_eps.R b/R/link_delayed_discharge_eps.R
index 677480697..c983afbeb 100644
--- a/R/link_delayed_discharge_eps.R
+++ b/R/link_delayed_discharge_eps.R
@@ -273,6 +273,15 @@ link_delayed_discharge_eps <- function(data, year) {
       .data$record_keydate2_dd,
       .keep_all = TRUE
     ) %>%
+    # add cij_delay
+    dplyr::mutate(has_delay = dplyr::if_else(
+      .data$chi != "" & !is.na(.data$cij_marker),
+      .data$smrtype == "DD-CIJ",
+      NA
+    )) %>%
+    dplyr::group_by(chi, cij_marker) %>%
+    dplyr::mutate(cij_delay = max(has_delay)) %>%
+    dplyr::ungroup()
     # tidy up and rename columns to match the format of episode files
     dplyr::select(
       "year" = "year_dd",
@@ -300,6 +309,7 @@ link_delayed_discharge_eps <- function(data, year) {
       "cij_admtype",
       "cij_adm_spec",
       "cij_dis_spec",
+      "cij_delay",
       "location",
       "spec" = "spec_dd",
       "dd_type"

From f565922e172fc023be3b17b8b6d9734bcdb18dcc Mon Sep 17 00:00:00 2001
From: lizihao-anu <lizihao-anu@users.noreply.github.com>
Date: Wed, 14 Jun 2023 14:22:51 +0000
Subject: [PATCH 104/200] Style code

---
 R/link_delayed_discharge_eps.R | 64 +++++++++++++++++-----------------
 1 file changed, 32 insertions(+), 32 deletions(-)

diff --git a/R/link_delayed_discharge_eps.R b/R/link_delayed_discharge_eps.R
index c983afbeb..e4f418516 100644
--- a/R/link_delayed_discharge_eps.R
+++ b/R/link_delayed_discharge_eps.R
@@ -282,38 +282,38 @@ link_delayed_discharge_eps <- function(data, year) {
     dplyr::group_by(chi, cij_marker) %>%
     dplyr::mutate(cij_delay = max(has_delay)) %>%
     dplyr::ungroup()
-    # tidy up and rename columns to match the format of episode files
-    dplyr::select(
-      "year" = "year_dd",
-      "recid" = "recid_dd",
-      "record_keydate1" = "record_keydate1_dd",
-      "record_keydate2" = "record_keydate2_dd",
-      "smrtype",
-      "chi",
-      "gender",
-      "dob",
-      "age",
-      "gpprac",
-      "postcode" = "postcode_dd",
-      "lca" = "dd_responsible_lca",
-      "hbtreatcode" = "hbtreatcode_dd",
-      "original_admission_date",
-      "delay_end_reason",
-      "primary_delay_reason",
-      "secondary_delay_reason",
-      "cij_marker",
-      "cij_start_date",
-      "cij_end_date",
-      "cij_pattype_code",
-      "cij_ipdc",
-      "cij_admtype",
-      "cij_adm_spec",
-      "cij_dis_spec",
-      "cij_delay",
-      "location",
-      "spec" = "spec_dd",
-      "dd_type"
-    ) %>%
+  # tidy up and rename columns to match the format of episode files
+  dplyr::select(
+    "year" = "year_dd",
+    "recid" = "recid_dd",
+    "record_keydate1" = "record_keydate1_dd",
+    "record_keydate2" = "record_keydate2_dd",
+    "smrtype",
+    "chi",
+    "gender",
+    "dob",
+    "age",
+    "gpprac",
+    "postcode" = "postcode_dd",
+    "lca" = "dd_responsible_lca",
+    "hbtreatcode" = "hbtreatcode_dd",
+    "original_admission_date",
+    "delay_end_reason",
+    "primary_delay_reason",
+    "secondary_delay_reason",
+    "cij_marker",
+    "cij_start_date",
+    "cij_end_date",
+    "cij_pattype_code",
+    "cij_ipdc",
+    "cij_admtype",
+    "cij_adm_spec",
+    "cij_dis_spec",
+    "cij_delay",
+    "location",
+    "spec" = "spec_dd",
+    "dd_type"
+  ) %>%
     # combine DD with episode data
     dplyr::bind_rows( # restore cij_end_date
       data %>%

From 5237f9edb9d5706667a6ab51176d651aa3063aa3 Mon Sep 17 00:00:00 2001
From: Zihao Li <lizihao_anu@outlook.com>
Date: Wed, 14 Jun 2023 15:32:11 +0100
Subject: [PATCH 105/200] add variables cij_delay, preventable_beddays

---
 R/create_individual_file.R | 48 +++++++++++++++++---------------------
 1 file changed, 21 insertions(+), 27 deletions(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index 04a8be917..1d0f8ac60 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -40,36 +40,30 @@ add_cij_columns <- function(episode_file) {
   episode_file %>%
     dplyr::mutate(
       CIJ_non_el = dplyr::if_else(.data$cij_pattype_code == 0,
-        .data$cij_marker,
-        NA_real_
-      ),
+                                  .data$cij_marker,
+                                  NA_real_),
       CIJ_el = dplyr::if_else(.data$cij_pattype_code == 1,
-        .data$cij_marker,
-        NA_real_
-      ),
+                              .data$cij_marker,
+                              NA_real_),
       CIJ_mat = dplyr::if_else(.data$cij_pattype_code == 2,
-        .data$cij_marker,
-        NA_real_
-      )
-    ) %>%
-    # dplyr::mutate(cij_delay = dplyr::if_else(
-    #   (.data$cij_delay == 1 & .data$cij_marker == 1),
-    #   1,
-    #   0
-    # )) %>%
-    dplyr::mutate(
-      preventable_admissions = dplyr::if_else(
-        (.data$cij_ppa == 1 & .data$cij_marker == 1),
+                               .data$cij_marker,
+                               NA_real_),
+      # assume cij_delay is logic variable
+      cij_delay = dplyr::if_else(
+        (.data$cij_delay & .data$cij_marker == 1),
         1,
-        0
-      ) # ,
-      # Come back to here
-      # preventable_beddays = dplyr::if_else(
-      #   (.data$cij_ppa == 1 & .data$Distinct_cij == 1),
-      #   as.numeric(.data$cij_end_date - .data$cij_start_date),
-      #   0
-      # )
-    )
+        0),
+      preventable_admissions = dplyr::if_else((.data$cij_ppa == 1 &
+                                                 .data$cij_marker == 1),
+                                              1,
+                                              0),
+      preventable_beddays = dplyr::if_else((.data$cij_ppa == 1 &
+                                              .data$cij_marker == 1),
+                                           as.numeric(
+                                             min(.data$cij_end_date, end_fy(year)) -
+                                               min(.data$cij_start_date, start_fy(year))
+                                           ),
+                                           0))
 }
 
 #' Add all columns

From bdfc0b441cffc5c0a2c9c65f1f8d488e5669ba67 Mon Sep 17 00:00:00 2001
From: Zihao Li <lizihao_anu@outlook.com>
Date: Wed, 14 Jun 2023 15:33:35 +0100
Subject: [PATCH 106/200] add missing variables health_net_cost,
 health_net_costincdnas, and cmh, dd sds columns

---
 R/create_individual_file.R | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index 1d0f8ac60..a1743dfd6 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -85,15 +85,26 @@ add_all_columns <- function(episode_file) {
     add_pis_columns("PIS", .data$recid == "PIS") %>%
     add_ooh_columns("OoH", .data$recid == "OoH") %>%
     # add_dn_columns("DN", .data$recid == "DN") %>%
-    # add_cmh_columns("CMH", .data$recid == "CMH") %>%
-    # add_dd_columns("DD", .data$recid == "DD") %>%
+    add_cmh_columns("CMH", .data$recid == "CMH") %>%
+    add_dd_columns("DD", .data$recid == "DD") %>%
     add_nsu_columns("NSU", .data$recid == "NSU") %>%
     add_nrs_columns("NRS", .data$recid == "NRS") %>%
     add_hl1_columns("HL1", .data$recid == "HL1") %>%
     add_ch_columns("CH", .data$recid == "CH") %>%
     add_hc_columns("HC", .data$recid == "HC") %>%
     add_at_columns("AT", .data$recid == "AT") %>%
-    add_sds_columns("SDS", .data$recid == "SDS")
+    add_sds_columns("SDS", .data$recid == "SDS") %>%
+    dplyr::mutate(
+      health_net_cost = Acute_cost +
+        Mat_cost +
+        MH_cost +
+        GLS_cost +
+        OP_cost_attend +
+        AE_cost +
+        PIS_cost +
+        OoH_cost,
+      health_net_costincdnas = health_net_cost + OP_cost_dnas
+    )
 }
 
 #' Add Acute columns

From 7b288bdd42e5d62a961924900488adb2f0c7b05d Mon Sep 17 00:00:00 2001
From: lizihao-anu <lizihao-anu@users.noreply.github.com>
Date: Wed, 14 Jun 2023 14:43:52 +0000
Subject: [PATCH 107/200] Style code

---
 R/create_individual_file.R | 39 ++++++++++++++++++++++----------------
 1 file changed, 23 insertions(+), 16 deletions(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index a1743dfd6..fa6c43e0f 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -40,30 +40,37 @@ add_cij_columns <- function(episode_file) {
   episode_file %>%
     dplyr::mutate(
       CIJ_non_el = dplyr::if_else(.data$cij_pattype_code == 0,
-                                  .data$cij_marker,
-                                  NA_real_),
+        .data$cij_marker,
+        NA_real_
+      ),
       CIJ_el = dplyr::if_else(.data$cij_pattype_code == 1,
-                              .data$cij_marker,
-                              NA_real_),
+        .data$cij_marker,
+        NA_real_
+      ),
       CIJ_mat = dplyr::if_else(.data$cij_pattype_code == 2,
-                               .data$cij_marker,
-                               NA_real_),
+        .data$cij_marker,
+        NA_real_
+      ),
       # assume cij_delay is logic variable
       cij_delay = dplyr::if_else(
         (.data$cij_delay & .data$cij_marker == 1),
         1,
-        0),
+        0
+      ),
       preventable_admissions = dplyr::if_else((.data$cij_ppa == 1 &
-                                                 .data$cij_marker == 1),
-                                              1,
-                                              0),
+        .data$cij_marker == 1),
+      1,
+      0
+      ),
       preventable_beddays = dplyr::if_else((.data$cij_ppa == 1 &
-                                              .data$cij_marker == 1),
-                                           as.numeric(
-                                             min(.data$cij_end_date, end_fy(year)) -
-                                               min(.data$cij_start_date, start_fy(year))
-                                           ),
-                                           0))
+        .data$cij_marker == 1),
+      as.numeric(
+        min(.data$cij_end_date, end_fy(year)) -
+          min(.data$cij_start_date, start_fy(year))
+      ),
+      0
+      )
+    )
 }
 
 #' Add all columns

From e907dd93f815fe02d19b9dca626d2a4a282d0b08 Mon Sep 17 00:00:00 2001
From: Zihao Li <lizihao_anu@outlook.com>
Date: Wed, 14 Jun 2023 16:32:51 +0100
Subject: [PATCH 108/200] add more variables needed

---
 R/aggregate_by_chi_zihao.R | 26 +++++++++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/R/aggregate_by_chi_zihao.R b/R/aggregate_by_chi_zihao.R
index c7073d6ad..8ea384a22 100644
--- a/R/aggregate_by_chi_zihao.R
+++ b/R/aggregate_by_chi_zihao.R
@@ -75,7 +75,31 @@ aggregate_by_chi_zihao <- function(episode_file) {
     # "cij_delay",
     "ooh_cases",
     "preventable_admissions",
-    "gpprac"
+    "gpprac",
+
+    "hbrescode",
+    "hscp",
+    "lca",
+    "ca2018",
+    "locality",
+    "datazone2011",
+    "hbpraccode",
+    "cluster",
+    "simd2020v2_rank",
+    "simd2020v2_sc_decile",
+    "simd2020v2_sc_quintile",
+    "simd2020v2_hb2019_decile",
+    "simd2020v2_hb2019_quintile",
+    "simd2020v2_hscp2019_decile",
+    "simd2020v2_hscp2019_quintile",
+    "ur8_2020",
+    "ur6_2020",
+    "ur3_2020",
+    "ur2_2020",
+    "hb2019",
+    "hscp2019",
+    "ca2019",
+    vars_start_with(episode_file, "sc_")
   )
   # columns to sum up
   cols4 <- c(

From 464790466fb5e893e5e809c3643e794bc823c2e1 Mon Sep 17 00:00:00 2001
From: lizihao-anu <lizihao-anu@users.noreply.github.com>
Date: Wed, 14 Jun 2023 15:35:09 +0000
Subject: [PATCH 109/200] Style code

---
 R/aggregate_by_chi_zihao.R | 1 -
 1 file changed, 1 deletion(-)

diff --git a/R/aggregate_by_chi_zihao.R b/R/aggregate_by_chi_zihao.R
index 8ea384a22..270f8c87c 100644
--- a/R/aggregate_by_chi_zihao.R
+++ b/R/aggregate_by_chi_zihao.R
@@ -76,7 +76,6 @@ aggregate_by_chi_zihao <- function(episode_file) {
     "ooh_cases",
     "preventable_admissions",
     "gpprac",
-
     "hbrescode",
     "hscp",
     "lca",

From 45688c309696885cf1bc5235cf30cf599403f461 Mon Sep 17 00:00:00 2001
From: James McMahon <james.mcmahon@phs.scot>
Date: Mon, 19 Jun 2023 16:48:43 +0100
Subject: [PATCH 110/200] Update R/link_delayed_discharge_eps.R

---
 R/link_delayed_discharge_eps.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/link_delayed_discharge_eps.R b/R/link_delayed_discharge_eps.R
index e35ca0367..cfc06524e 100644
--- a/R/link_delayed_discharge_eps.R
+++ b/R/link_delayed_discharge_eps.R
@@ -279,7 +279,7 @@ link_delayed_discharge_eps <- function(data, year) {
     )) %>%
     dplyr::group_by(chi, cij_marker) %>%
     dplyr::mutate(cij_delay = max(has_delay)) %>%
-    dplyr::ungroup()
+    dplyr::ungroup() %>%
   # tidy up and rename columns to match the format of episode files
   dplyr::select(
     "year" = "year_dd",

From b2676d41be406d18831805958666e6b5f419558f Mon Sep 17 00:00:00 2001
From: Moohan <Moohan@users.noreply.github.com>
Date: Mon, 19 Jun 2023 15:52:11 +0000
Subject: [PATCH 111/200] Style code

---
 R/link_delayed_discharge_eps.R | 64 +++++++++++++++++-----------------
 1 file changed, 32 insertions(+), 32 deletions(-)

diff --git a/R/link_delayed_discharge_eps.R b/R/link_delayed_discharge_eps.R
index cfc06524e..ee99503dd 100644
--- a/R/link_delayed_discharge_eps.R
+++ b/R/link_delayed_discharge_eps.R
@@ -280,38 +280,38 @@ link_delayed_discharge_eps <- function(data, year) {
     dplyr::group_by(chi, cij_marker) %>%
     dplyr::mutate(cij_delay = max(has_delay)) %>%
     dplyr::ungroup() %>%
-  # tidy up and rename columns to match the format of episode files
-  dplyr::select(
-    "year" = "year_dd",
-    "recid" = "recid_dd",
-    "record_keydate1" = "record_keydate1_dd",
-    "record_keydate2" = "record_keydate2_dd",
-    "smrtype",
-    "chi",
-    "gender",
-    "dob",
-    "age",
-    "gpprac",
-    "postcode" = "postcode_dd",
-    "lca" = "dd_responsible_lca",
-    "hbtreatcode" = "hbtreatcode_dd",
-    "original_admission_date",
-    "delay_end_reason",
-    "primary_delay_reason",
-    "secondary_delay_reason",
-    "cij_marker",
-    "cij_start_date",
-    "cij_end_date",
-    "cij_pattype_code",
-    "cij_ipdc",
-    "cij_admtype",
-    "cij_adm_spec",
-    "cij_dis_spec",
-    "cij_delay",
-    "location",
-    "spec" = "spec_dd",
-    "dd_type"
-  ) %>%
+    # tidy up and rename columns to match the format of episode files
+    dplyr::select(
+      "year" = "year_dd",
+      "recid" = "recid_dd",
+      "record_keydate1" = "record_keydate1_dd",
+      "record_keydate2" = "record_keydate2_dd",
+      "smrtype",
+      "chi",
+      "gender",
+      "dob",
+      "age",
+      "gpprac",
+      "postcode" = "postcode_dd",
+      "lca" = "dd_responsible_lca",
+      "hbtreatcode" = "hbtreatcode_dd",
+      "original_admission_date",
+      "delay_end_reason",
+      "primary_delay_reason",
+      "secondary_delay_reason",
+      "cij_marker",
+      "cij_start_date",
+      "cij_end_date",
+      "cij_pattype_code",
+      "cij_ipdc",
+      "cij_admtype",
+      "cij_adm_spec",
+      "cij_dis_spec",
+      "cij_delay",
+      "location",
+      "spec" = "spec_dd",
+      "dd_type"
+    ) %>%
     # combine DD with episode data
     dplyr::bind_rows( # restore cij_end_date
       data %>%

From 8048e68c829edbf6c0c43e1bf3ade1d142e0e250 Mon Sep 17 00:00:00 2001
From: Zihao Li <lizihao_anu@outlook.com>
Date: Tue, 20 Jun 2023 11:40:35 +0100
Subject: [PATCH 112/200] amend costs

---
 R/create_individual_file.R | 40 ++++++++++++++++++++++++--------------
 1 file changed, 25 insertions(+), 15 deletions(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index fa6c43e0f..46d825bad 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -51,12 +51,12 @@ add_cij_columns <- function(episode_file) {
         .data$cij_marker,
         NA_real_
       ),
-      # assume cij_delay is logic variable
-      cij_delay = dplyr::if_else(
-        (.data$cij_delay & .data$cij_marker == 1),
-        1,
-        0
-      ),
+      # # assume cij_delay is logic variable
+      # cij_delay = dplyr::if_else(
+      #   (.data$cij_delay & .data$cij_marker == 1),
+      #   1,
+      #   0
+      # ),
       preventable_admissions = dplyr::if_else((.data$cij_ppa == 1 &
         .data$cij_marker == 1),
       1,
@@ -102,15 +102,25 @@ add_all_columns <- function(episode_file) {
     add_at_columns("AT", .data$recid == "AT") %>%
     add_sds_columns("SDS", .data$recid == "SDS") %>%
     dplyr::mutate(
-      health_net_cost = Acute_cost +
-        Mat_cost +
-        MH_cost +
-        GLS_cost +
-        OP_cost_attend +
-        AE_cost +
-        PIS_cost +
-        OoH_cost,
-      health_net_costincdnas = health_net_cost + OP_cost_dnas
+      health_net_cost = rowSums(dplyr::select(
+        .,
+        c(
+          Acute_cost,
+          Mat_cost,
+          MH_cost,
+          GLS_cost,
+          OP_cost_attend,
+          AE_cost,
+          PIS_cost,
+          OoH_cost
+        )),
+        na.rm = TRUE),
+      health_net_costincdnas = rowSums(dplyr::select(.,
+                                                     c(
+                                                       health_net_cost,
+                                                       OP_cost_dnas
+                                                     )),
+                                       na.rm = TRUE)
     )
 }
 

From 78197c61ae2ee001e33a26788480954278e9a32c Mon Sep 17 00:00:00 2001
From: lizihao-anu <lizihao-anu@users.noreply.github.com>
Date: Tue, 20 Jun 2023 10:43:07 +0000
Subject: [PATCH 113/200] Style code

---
 R/create_individual_file.R | 45 ++++++++++++++++++++++----------------
 1 file changed, 26 insertions(+), 19 deletions(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index 46d825bad..8f7ca9a50 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -102,25 +102,32 @@ add_all_columns <- function(episode_file) {
     add_at_columns("AT", .data$recid == "AT") %>%
     add_sds_columns("SDS", .data$recid == "SDS") %>%
     dplyr::mutate(
-      health_net_cost = rowSums(dplyr::select(
-        .,
-        c(
-          Acute_cost,
-          Mat_cost,
-          MH_cost,
-          GLS_cost,
-          OP_cost_attend,
-          AE_cost,
-          PIS_cost,
-          OoH_cost
-        )),
-        na.rm = TRUE),
-      health_net_costincdnas = rowSums(dplyr::select(.,
-                                                     c(
-                                                       health_net_cost,
-                                                       OP_cost_dnas
-                                                     )),
-                                       na.rm = TRUE)
+      health_net_cost = rowSums(
+        dplyr::select(
+          .,
+          c(
+            Acute_cost,
+            Mat_cost,
+            MH_cost,
+            GLS_cost,
+            OP_cost_attend,
+            AE_cost,
+            PIS_cost,
+            OoH_cost
+          )
+        ),
+        na.rm = TRUE
+      ),
+      health_net_costincdnas = rowSums(
+        dplyr::select(
+          .,
+          c(
+            health_net_cost,
+            OP_cost_dnas
+          )
+        ),
+        na.rm = TRUE
+      )
     )
 }
 

From 4fd8ac4bb880b893095f74c1b195896d15e77fed Mon Sep 17 00:00:00 2001
From: Jennifer Thom <jennifer.thom@phs.scot>
Date: Tue, 20 Jun 2023 14:06:52 +0100
Subject: [PATCH 114/200] Revert "amend costs"

This reverts commit 8048e68c829edbf6c0c43e1bf3ade1d142e0e250.
---
 R/create_individual_file.R | 47 ++++++++++++--------------------------
 1 file changed, 15 insertions(+), 32 deletions(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index 8f7ca9a50..fa6c43e0f 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -51,12 +51,12 @@ add_cij_columns <- function(episode_file) {
         .data$cij_marker,
         NA_real_
       ),
-      # # assume cij_delay is logic variable
-      # cij_delay = dplyr::if_else(
-      #   (.data$cij_delay & .data$cij_marker == 1),
-      #   1,
-      #   0
-      # ),
+      # assume cij_delay is logic variable
+      cij_delay = dplyr::if_else(
+        (.data$cij_delay & .data$cij_marker == 1),
+        1,
+        0
+      ),
       preventable_admissions = dplyr::if_else((.data$cij_ppa == 1 &
         .data$cij_marker == 1),
       1,
@@ -102,32 +102,15 @@ add_all_columns <- function(episode_file) {
     add_at_columns("AT", .data$recid == "AT") %>%
     add_sds_columns("SDS", .data$recid == "SDS") %>%
     dplyr::mutate(
-      health_net_cost = rowSums(
-        dplyr::select(
-          .,
-          c(
-            Acute_cost,
-            Mat_cost,
-            MH_cost,
-            GLS_cost,
-            OP_cost_attend,
-            AE_cost,
-            PIS_cost,
-            OoH_cost
-          )
-        ),
-        na.rm = TRUE
-      ),
-      health_net_costincdnas = rowSums(
-        dplyr::select(
-          .,
-          c(
-            health_net_cost,
-            OP_cost_dnas
-          )
-        ),
-        na.rm = TRUE
-      )
+      health_net_cost = Acute_cost +
+        Mat_cost +
+        MH_cost +
+        GLS_cost +
+        OP_cost_attend +
+        AE_cost +
+        PIS_cost +
+        OoH_cost,
+      health_net_costincdnas = health_net_cost + OP_cost_dnas
     )
 }
 

From b6a1e6f8a5d7560a43828f96df2427fd1c58fb34 Mon Sep 17 00:00:00 2001
From: Jennifer Thom <jennifer.thom@phs.scot>
Date: Tue, 20 Jun 2023 14:10:37 +0100
Subject: [PATCH 115/200] Add DN and cij_delay back in

---
 R/aggregate_by_chi_zihao.R | 4 ++--
 R/create_individual_file.R | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/R/aggregate_by_chi_zihao.R b/R/aggregate_by_chi_zihao.R
index 270f8c87c..b665a8db2 100644
--- a/R/aggregate_by_chi_zihao.R
+++ b/R/aggregate_by_chi_zihao.R
@@ -72,7 +72,7 @@ aggregate_by_chi_zihao <- function(episode_file) {
     "cij_el",
     "cij_non_el",
     "cij_mat",
-    # "cij_delay",
+    "cij_delay",
     "ooh_cases",
     "preventable_admissions",
     "gpprac",
@@ -120,7 +120,7 @@ aggregate_by_chi_zihao <- function(episode_file) {
         "time",
         "assessment",
         "other",
-        # "dn",
+        "dn",
         "nhs24",
         "pcc",
         "_dnas"
diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index fa6c43e0f..964f36a1b 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -91,7 +91,7 @@ add_all_columns <- function(episode_file) {
     add_ae_columns("AE", .data$recid == "AE2") %>%
     add_pis_columns("PIS", .data$recid == "PIS") %>%
     add_ooh_columns("OoH", .data$recid == "OoH") %>%
-    # add_dn_columns("DN", .data$recid == "DN") %>%
+    add_dn_columns("DN", .data$recid == "DN") %>%
     add_cmh_columns("CMH", .data$recid == "CMH") %>%
     add_dd_columns("DD", .data$recid == "DD") %>%
     add_nsu_columns("NSU", .data$recid == "NSU") %>%

From f32c7a2f6cdd0dfd9b1ba861986366ca73763dd9 Mon Sep 17 00:00:00 2001
From: Zihao Li <lizihao_anu@outlook.com>
Date: Tue, 20 Jun 2023 14:22:47 +0100
Subject: [PATCH 116/200] fix the issue

---
 R/create_individual_file.R | 13 +++----------
 1 file changed, 3 insertions(+), 10 deletions(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index 8f7ca9a50..43de83f39 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -118,16 +118,9 @@ add_all_columns <- function(episode_file) {
         ),
         na.rm = TRUE
       ),
-      health_net_costincdnas = rowSums(
-        dplyr::select(
-          .,
-          c(
-            health_net_cost,
-            OP_cost_dnas
-          )
-        ),
-        na.rm = TRUE
-      )
+      health_net_costincdnas =
+        health_net_cost +
+        dplyr::if_else(is.na(OP_cost_dnas), 0, OP_cost_dnas)
     )
 }
 

From 04fe893ac0a55487fe915097d0b9d0594241643b Mon Sep 17 00:00:00 2001
From: lizihao-anu <lizihao-anu@users.noreply.github.com>
Date: Tue, 20 Jun 2023 13:57:02 +0000
Subject: [PATCH 117/200] Style code

---
 R/create_individual_file.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index dc3cc28ac..b7a7d27a6 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -120,7 +120,7 @@ add_all_columns <- function(episode_file) {
       ),
       health_net_costincdnas =
         health_net_cost +
-        dplyr::if_else(is.na(OP_cost_dnas), 0, OP_cost_dnas)
+          dplyr::if_else(is.na(OP_cost_dnas), 0, OP_cost_dnas)
     )
 }
 

From b468271f4d6ff31de1b6e4e8c31d305b349ffa13 Mon Sep 17 00:00:00 2001
From: Zihao Li <lizihao_anu@outlook.com>
Date: Wed, 21 Jun 2023 09:50:51 +0100
Subject: [PATCH 118/200] remove running in chunks

---
 R/aggregate_by_chi_zihao.R | 142 ++++++++++++++++---------------------
 R/create_individual_file.R |  12 ++--
 2 files changed, 69 insertions(+), 85 deletions(-)

diff --git a/R/aggregate_by_chi_zihao.R b/R/aggregate_by_chi_zihao.R
index 270f8c87c..884ca56a1 100644
--- a/R/aggregate_by_chi_zihao.R
+++ b/R/aggregate_by_chi_zihao.R
@@ -7,10 +7,10 @@
 #' @importFrom data.table .SD
 #'
 #' @inheritParams create_individual_file
-aggregate_by_chi_zihao <- function(episode_file) {
+aggregate_by_chi_zihao <- function(individual_file) {
   cli::cli_alert_info("Aggregate by CHI function started at {Sys.time()}")
 
-  episode_file <- episode_file %>%
+  individual_file <- individual_file %>%
     dplyr::select(-c(postcode, gpprac)) %>%
     dplyr::rename(
       "gpprac" = "most_recent_gpprac",
@@ -22,13 +22,13 @@ aggregate_by_chi_zihao <- function(episode_file) {
       dplyr::ends_with("_DoB")
     ))
 
-  names(episode_file) <- tolower(names(episode_file))
+  names(individual_file) <- tolower(names(individual_file))
 
-  data.table::setDT(episode_file) # Convert to data.table
+  data.table::setDT(individual_file) # Convert to data.table
 
   # Sort the data within each chunk
   data.table::setkeyv(
-    episode_file,
+    individual_file,
     c(
       "chi",
       "record_keydate1",
@@ -39,7 +39,7 @@ aggregate_by_chi_zihao <- function(episode_file) {
   )
 
   data.table::setnames(
-    episode_file,
+    individual_file,
     c(
       "ch_chi_cis", "cij_marker", "ooh_case_id"
       # ,"hh_in_fy"
@@ -50,19 +50,10 @@ aggregate_by_chi_zihao <- function(episode_file) {
     )
   )
 
-  # Initialize an empty data.table for the aggregated results
-  aggregated_data <- data.table::data.table()
-
-  # Process the data in chunks
-  chunk_size <- min(nrow(episode_file), 5e7)
-  # Adjust the chunk size as per your system's memory capacity
-  n_chunks <- nrow(episode_file) %/% chunk_size
-
-
   # colums specification
   # columns to select last
   cols2 <- vars_end_with(
-    episode_file,
+    individual_file,
     c("postcode", "dob", "ggprac")
   )
   # columns to select last unique rows
@@ -98,12 +89,12 @@ aggregate_by_chi_zihao <- function(episode_file) {
     "hb2019",
     "hscp2019",
     "ca2019",
-    vars_start_with(episode_file, "sc_")
+    vars_start_with(individual_file, "sc_")
   )
   # columns to sum up
   cols4 <- c(
     vars_end_with(
-      episode_file,
+      individual_file,
       c(
         "episodes",
         "beddays",
@@ -120,20 +111,22 @@ aggregate_by_chi_zihao <- function(episode_file) {
         "time",
         "assessment",
         "other",
-        # "dn",
+        "dn",
         "nhs24",
         "pcc",
         "_dnas"
       )
     ),
     vars_start_with(
-      episode_file,
+      individual_file,
       "sds_option"
-    )
+    ),
+    "health_net_costincdnas"
   )
   cols4 <- cols4[!(cols4 %in% c("ch_cis_episodes"))]
-  # # columns to select maximum
-  # cols5 <- vars_contain(episode_file, "nsu")
+  # columns to select maximum
+  cols5 <- vars_contain(individual_file, c("nsu", "hl1_in_fy"))
+  cols5 <- cols5[!(cols5 %in% c("ooh_consultation_time"))]
   # columns to select first row
   cols6 <- c(
     condition_cols(),
@@ -141,68 +134,59 @@ aggregate_by_chi_zihao <- function(episode_file) {
     # "deceased",
     "year",
     vars_end_with(
-      episode_file,
+      individual_file,
       c("_cohort", "end_fy", "start_fy")
     )
   )
 
-  for (i in 1:n_chunks) {
-    start <- (i - 1) * chunk_size + 1
-    end <- i * chunk_size
-    # Subset the data to the current chunk
-    chunk <- episode_file[start:end]
-
-    # compute
-    chunk_cols1 <- chunk[,
-      .(gender = mean(gender)),
-      by = chi
-    ]
-    chunk_cols2 <- chunk[,
-      .SD[.N],
-      .SDcols = cols2,
-      by = chi
-    ]
-    chunk_cols3 <- chunk[,
-      lapply(.SD, function(x) {
-        data.table::uniqueN(x, na.rm = TRUE)
-      }),
-      .SDcols = cols3,
-      by = chi
-    ]
-    chunk_cols4 <- chunk[,
-      lapply(.SD, function(x) {
-        sum(x, na.rm = TRUE)
-      }),
-      .SDcols = cols4,
-      by = chi
-    ]
-    # chunk_cols5 <- chunk[,
-    #                      lapply(.SD, function(x) max(x, na.rm = TRUE)),
-    #                      .SDcols = cols5,
-    #                      by = chi]
-    chunk_cols6 <- chunk[,
-      lapply(.SD, function(x) {
-        x[!is.na(x)][1]
-      }),
-      .SDcols = cols6,
-      by = chi
-    ]
-    chunk_agg <- dplyr::bind_cols(
-      chunk_cols1,
-      chunk_cols2[, chi := NULL],
-      chunk_cols3[, chi := NULL],
-      chunk_cols4[, chi := NULL],
-      # chunk_cols5[, chi := NULL],
-      chunk_cols6[, chi := NULL]
-    )
+  # compute
+  individual_file_cols1 <- individual_file[,
+    .(gender = mean(gender)),
+    by = chi
+  ]
+  individual_file_cols2 <- individual_file[,
+    .SD[.N],
+    .SDcols = cols2,
+    by = chi
+  ]
+  individual_file_cols3 <- individual_file[,
+    lapply(.SD, function(x) {
+      data.table::uniqueN(x, na.rm = TRUE)
+    }),
+    .SDcols = cols3,
+    by = chi
+  ]
+  individual_file_cols4 <- individual_file[,
+    lapply(.SD, function(x) {
+      sum(x, na.rm = TRUE)
+    }),
+    .SDcols = cols4,
+    by = chi
+  ]
+  individual_file_cols5 <- individual_file[,
+                       lapply(.SD, function(x) max(x, na.rm = TRUE)),
+                       .SDcols = cols5,
+                       by = chi]
+  individual_file_cols6 <- individual_file[,
+    lapply(.SD, function(x) {
+      x[!is.na(x)][1]
+    }),
+    .SDcols = cols6,
+    by = chi
+  ]
+  individual_file <- dplyr::bind_cols(
+    individual_file_cols1,
+    individual_file_cols2[, chi := NULL],
+    individual_file_cols3[, chi := NULL],
+    individual_file_cols4[, chi := NULL],
+    individual_file_cols5[, chi := NULL],
+    individual_file_cols6[, chi := NULL]
+  )
 
-    # Append the aggregated chunk to the overall result
-    aggregated_data <-
-      data.table::rbindlist(list(aggregated_data, chunk_agg))
-  }
-  aggregated_data <- dplyr::as_tibble(aggregated_data)
+  # convert back to tibble
+  individual_file <- dplyr::as_tibble(individual_file)
 
-  return(aggregated_data)
+  return(individual_file)
 }
 
 
diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index b7a7d27a6..76473807f 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -51,12 +51,12 @@ add_cij_columns <- function(episode_file) {
         .data$cij_marker,
         NA_real_
       ),
-      # assume cij_delay is logic variable
-      cij_delay = dplyr::if_else(
-        (.data$cij_delay & .data$cij_marker == 1),
-        1,
-        0
-      ),
+      # # assume cij_delay is logic variable
+      # cij_delay = dplyr::if_else(
+      #   (.data$cij_delay & .data$cij_marker == 1),
+      #   1,
+      #   0
+      # ),
       preventable_admissions = dplyr::if_else((.data$cij_ppa == 1 &
         .data$cij_marker == 1),
       1,

From 55a075cfae4601759cf1ae807fe0b2b64c0f4785 Mon Sep 17 00:00:00 2001
From: lizihao-anu <lizihao-anu@users.noreply.github.com>
Date: Wed, 21 Jun 2023 08:53:07 +0000
Subject: [PATCH 119/200] Style code

---
 R/aggregate_by_chi_zihao.R | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/R/aggregate_by_chi_zihao.R b/R/aggregate_by_chi_zihao.R
index 884ca56a1..4fa1fdfd2 100644
--- a/R/aggregate_by_chi_zihao.R
+++ b/R/aggregate_by_chi_zihao.R
@@ -164,9 +164,10 @@ aggregate_by_chi_zihao <- function(individual_file) {
     by = chi
   ]
   individual_file_cols5 <- individual_file[,
-                       lapply(.SD, function(x) max(x, na.rm = TRUE)),
-                       .SDcols = cols5,
-                       by = chi]
+    lapply(.SD, function(x) max(x, na.rm = TRUE)),
+    .SDcols = cols5,
+    by = chi
+  ]
   individual_file_cols6 <- individual_file[,
     lapply(.SD, function(x) {
       x[!is.na(x)][1]

From b9fbf295d87d2b1182a1a1172c082ea91046139c Mon Sep 17 00:00:00 2001
From: Jennifer Thom <jennifer.thom@phs.scot>
Date: Wed, 21 Jun 2023 10:32:51 +0100
Subject: [PATCH 120/200] Update tests to include missing variables

---
 R/process_tests_individual_file.R | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/R/process_tests_individual_file.R b/R/process_tests_individual_file.R
index 2eb3503e2..26d3439d1 100644
--- a/R/process_tests_individual_file.R
+++ b/R/process_tests_individual_file.R
@@ -12,10 +12,10 @@ process_tests_individual_file <- function(data, year) {
       "year",
       "chi",
       "gender",
-      # "postcode", # Add back in once postcode is fixed
+      "postcode",
       "dob",
-      # "hbrescode", #add back in when available
-      # "health_net_cost",
+      "hbrescode",
+      "health_net_cost",
       slfhelper::ltc_vars,
       dplyr::contains(c(
         "beddays",
@@ -61,8 +61,8 @@ produce_individual_file_tests <- function(data) {
   test_flags <- data %>%
     # use functions to create HB and partnership flags
     create_demog_test_flags() %>%
-    # create_hb_test_flags(.data$hbrescode) %>%
-    # create_hb_cost_test_flags(.data$hbrescode, .data$health_net_cost) %>%
+    create_hb_test_flags(.data$hbrescode) %>%
+    create_hb_cost_test_flags(.data$hbrescode, .data$health_net_cost) %>%
     # keep variables for comparison
     dplyr::select(c("valid_chi":dplyr::last_col())) %>%
     # use function to sum new test flags
@@ -82,13 +82,13 @@ produce_individual_file_tests <- function(data) {
       measure = "all"
     )
 
-  # min_max_measures <- data %>%
-  #   calculate_measures(
-  #     vars = c(
-  #       "health_net_cost",
-  #     ),
-  #     measure = "min-max"
-  #   )
+  min_max_measures <- data %>%
+    calculate_measures(
+      vars = c(
+        "health_net_cost",
+      ),
+      measure = "min-max"
+    )
 
   sum_measures <- data %>%
     dplyr::select(slfhelper::ltc_vars) %>%
@@ -102,7 +102,7 @@ produce_individual_file_tests <- function(data) {
   join_output <- list(
     test_flags,
     all_measures,
-    # min_max_measures,
+    min_max_measures,
     sum_measures
   ) %>%
     purrr::reduce(dplyr::full_join, by = c("measure", "value"))

From 74da47c10701427954c995c706a0f77d66b65483 Mon Sep 17 00:00:00 2001
From: Jennifer Thom <jennifer.thom@phs.scot>
Date: Wed, 21 Jun 2023 10:43:56 +0100
Subject: [PATCH 121/200] Remove unnecessary comma

---
 R/process_tests_individual_file.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/process_tests_individual_file.R b/R/process_tests_individual_file.R
index 26d3439d1..32bbd8d3a 100644
--- a/R/process_tests_individual_file.R
+++ b/R/process_tests_individual_file.R
@@ -85,7 +85,7 @@ produce_individual_file_tests <- function(data) {
   min_max_measures <- data %>%
     calculate_measures(
       vars = c(
-        "health_net_cost",
+        "health_net_cost"
       ),
       measure = "min-max"
     )

From 79981a328163e9a2410aa04ee8f0ff2bd8369243 Mon Sep 17 00:00:00 2001
From: Zihao Li <lizihao_anu@outlook.com>
Date: Wed, 21 Jun 2023 12:41:34 +0100
Subject: [PATCH 122/200] fix the bug of preventable_beddays

---
 R/create_individual_file.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index 76473807f..5b5fef0ef 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -65,8 +65,8 @@ add_cij_columns <- function(episode_file) {
       preventable_beddays = dplyr::if_else((.data$cij_ppa == 1 &
         .data$cij_marker == 1),
       as.numeric(
-        min(.data$cij_end_date, end_fy(year)) -
-          min(.data$cij_start_date, start_fy(year))
+        pmin(.data$cij_end_date, end_fy(year)) -
+          pmin(.data$cij_start_date, start_fy(year))
       ),
       0
       )

From a029a10f27d150f6f6e5db605e020fea45467420 Mon Sep 17 00:00:00 2001
From: lizihao-anu <lizihao-anu@users.noreply.github.com>
Date: Wed, 21 Jun 2023 11:44:20 +0000
Subject: [PATCH 123/200] Update documentation

---
 man/aggregate_by_chi_zihao.Rd | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/man/aggregate_by_chi_zihao.Rd b/man/aggregate_by_chi_zihao.Rd
index 3d4961e19..a754fde4d 100644
--- a/man/aggregate_by_chi_zihao.Rd
+++ b/man/aggregate_by_chi_zihao.Rd
@@ -4,10 +4,7 @@
 \alias{aggregate_by_chi_zihao}
 \title{Aggregate by CHI}
 \usage{
-aggregate_by_chi_zihao(episode_file)
-}
-\arguments{
-\item{episode_file}{Tibble containing episodic data}
+aggregate_by_chi_zihao(individual_file)
 }
 \description{
 Aggregate episode file by CHI to convert into

From 71702e0b7cb1663b0818ae4aa15bed04e009aae3 Mon Sep 17 00:00:00 2001
From: Zihao Li <lizihao_anu@outlook.com>
Date: Wed, 21 Jun 2023 13:50:13 +0100
Subject: [PATCH 124/200] fix total ae_attendances

---
 R/create_individual_file.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index 5b5fef0ef..99a1a6e77 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -195,7 +195,7 @@ add_ae_columns <- function(episode_file, prefix, condition) {
   condition <- substitute(condition)
   episode_file %>%
     add_standard_cols(prefix, condition, cost = TRUE) %>%
-    dplyr::mutate("{prefix}_attendances" := dplyr::if_else(eval(condition), .data$cost_total_net, NA_real_))
+    dplyr::mutate("{prefix}_attendances" := dplyr::if_else(eval(condition), 1, NA_real_))
 }
 
 #' Add PIS columns

From 1667ff052d18e06bfebaeefbc4ace38898ef3c19 Mon Sep 17 00:00:00 2001
From: Zihao Li <lizihao_anu@outlook.com>
Date: Wed, 21 Jun 2023 14:16:54 +0100
Subject: [PATCH 125/200] fix the bug of preventable_admissions

---
 R/aggregate_by_chi_zihao.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/R/aggregate_by_chi_zihao.R b/R/aggregate_by_chi_zihao.R
index 4fa1fdfd2..818c17af0 100644
--- a/R/aggregate_by_chi_zihao.R
+++ b/R/aggregate_by_chi_zihao.R
@@ -65,7 +65,6 @@ aggregate_by_chi_zihao <- function(individual_file) {
     "cij_mat",
     # "cij_delay",
     "ooh_cases",
-    "preventable_admissions",
     "gpprac",
     "hbrescode",
     "hscp",
@@ -121,7 +120,8 @@ aggregate_by_chi_zihao <- function(individual_file) {
       individual_file,
       "sds_option"
     ),
-    "health_net_costincdnas"
+    "health_net_costincdnas",
+    "preventable_admissions"
   )
   cols4 <- cols4[!(cols4 %in% c("ch_cis_episodes"))]
   # columns to select maximum

From 0a517d3838897da8c9a5fc25bd718b7d7f898f03 Mon Sep 17 00:00:00 2001
From: Zihao Li <lizihao_anu@outlook.com>
Date: Wed, 21 Jun 2023 15:37:32 +0100
Subject: [PATCH 126/200] fix the bug of hbrescode etc

---
 R/aggregate_by_chi_zihao.R | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/R/aggregate_by_chi_zihao.R b/R/aggregate_by_chi_zihao.R
index 818c17af0..a6b95453f 100644
--- a/R/aggregate_by_chi_zihao.R
+++ b/R/aggregate_by_chi_zihao.R
@@ -54,16 +54,7 @@ aggregate_by_chi_zihao <- function(individual_file) {
   # columns to select last
   cols2 <- vars_end_with(
     individual_file,
-    c("postcode", "dob", "ggprac")
-  )
-  # columns to select last unique rows
-  cols3 <- c(
-    "ch_cis_episodes",
-    "cij_total",
-    "cij_el",
-    "cij_non_el",
-    "cij_mat",
-    # "cij_delay",
+    c("postcode", "dob", "ggprac"),
     "ooh_cases",
     "gpprac",
     "hbrescode",
@@ -90,6 +81,15 @@ aggregate_by_chi_zihao <- function(individual_file) {
     "ca2019",
     vars_start_with(individual_file, "sc_")
   )
+  # columns to select last unique rows
+  cols3 <- c(
+    "ch_cis_episodes",
+    "cij_total",
+    "cij_el",
+    "cij_non_el",
+    "cij_mat",
+    # "cij_delay"
+  )
   # columns to sum up
   cols4 <- c(
     vars_end_with(

From 3b24326066ccb47c73318116676da27d159712d3 Mon Sep 17 00:00:00 2001
From: Zihao Li <lizihao_anu@outlook.com>
Date: Wed, 21 Jun 2023 17:10:00 +0100
Subject: [PATCH 127/200] minor fix

---
 R/aggregate_by_chi_zihao.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/aggregate_by_chi_zihao.R b/R/aggregate_by_chi_zihao.R
index a6b95453f..7162b55db 100644
--- a/R/aggregate_by_chi_zihao.R
+++ b/R/aggregate_by_chi_zihao.R
@@ -87,7 +87,7 @@ aggregate_by_chi_zihao <- function(individual_file) {
     "cij_total",
     "cij_el",
     "cij_non_el",
-    "cij_mat",
+    "cij_mat"
     # "cij_delay"
   )
   # columns to sum up

From 4e4330cd8c8360c0c1470d9aa76467ae427d2306 Mon Sep 17 00:00:00 2001
From: Zihao Li <lizihao_anu@outlook.com>
Date: Wed, 21 Jun 2023 17:32:46 +0100
Subject: [PATCH 128/200] minor fix

---
 R/aggregate_by_chi_zihao.R | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/R/aggregate_by_chi_zihao.R b/R/aggregate_by_chi_zihao.R
index 7162b55db..3171a59dd 100644
--- a/R/aggregate_by_chi_zihao.R
+++ b/R/aggregate_by_chi_zihao.R
@@ -52,9 +52,9 @@ aggregate_by_chi_zihao <- function(individual_file) {
 
   # colums specification
   # columns to select last
-  cols2 <- vars_end_with(
-    individual_file,
-    c("postcode", "dob", "ggprac"),
+  cols2 <- c(
+    vars_end_with(individual_file,
+                  c("postcode", "dob", "ggprac")),
     "ooh_cases",
     "gpprac",
     "hbrescode",

From 6bdd780a798e85c796f7ae53b129ec5881e29e63 Mon Sep 17 00:00:00 2001
From: lizihao-anu <lizihao-anu@users.noreply.github.com>
Date: Wed, 21 Jun 2023 16:35:00 +0000
Subject: [PATCH 129/200] Style code

---
 R/aggregate_by_chi_zihao.R | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/R/aggregate_by_chi_zihao.R b/R/aggregate_by_chi_zihao.R
index 3171a59dd..9a7769d6a 100644
--- a/R/aggregate_by_chi_zihao.R
+++ b/R/aggregate_by_chi_zihao.R
@@ -53,8 +53,10 @@ aggregate_by_chi_zihao <- function(individual_file) {
   # colums specification
   # columns to select last
   cols2 <- c(
-    vars_end_with(individual_file,
-                  c("postcode", "dob", "ggprac")),
+    vars_end_with(
+      individual_file,
+      c("postcode", "dob", "ggprac")
+    ),
     "ooh_cases",
     "gpprac",
     "hbrescode",

From 06e1c7c893873e60bcce7e1a1e4606e23a7ad525 Mon Sep 17 00:00:00 2001
From: James McMahon <james.mcmahon@phs.scot>
Date: Tue, 20 Jun 2023 11:16:28 +0100
Subject: [PATCH 130/200] Fix some warnings being produced by the tests

---
 R/create_monthly_costs.R | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/R/create_monthly_costs.R b/R/create_monthly_costs.R
index 08f4ce59a..e9e5eedf3 100644
--- a/R/create_monthly_costs.R
+++ b/R/create_monthly_costs.R
@@ -65,14 +65,14 @@ create_monthly_costs <- function(data,
   add_months <- setdiff(full_cost_col, available_months)
 
   add_months_df <- dplyr::as_tibble(
-    matrix(0, nrow = nrow(data), ncol = length(add_months))
+    matrix(0, nrow = nrow(data), ncol = length(add_months)),
+    .name_repair = ~add_months
   )
-  names(add_months_df) <- add_months
 
   daycase_cost_months <- daycase_cost_months %>%
     dplyr::bind_cols(add_months_df) %>%
     dplyr::select(c(
-      full_cost_col,
+      dplyr::all_of(full_cost_col),
       "daycase_check"
     ))
 

From e14ae02b7127eed5d751d2ab2753a3d60e13bed7 Mon Sep 17 00:00:00 2001
From: James McMahon <james.mcmahon@phs.scot>
Date: Tue, 20 Jun 2023 11:33:17 +0100
Subject: [PATCH 131/200] Fix failing test

---
 tests/testthat/test-get_existing_data_for_tests.R | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/tests/testthat/test-get_existing_data_for_tests.R b/tests/testthat/test-get_existing_data_for_tests.R
index 57309f031..2b21d71db 100644
--- a/tests/testthat/test-get_existing_data_for_tests.R
+++ b/tests/testthat/test-get_existing_data_for_tests.R
@@ -11,7 +11,11 @@ test_that("Get existing data works", {
 
   slf_data <- suppressWarnings(get_existing_data_for_tests(dummy_new_data))
 
-  expect_named(slf_data, c("chi", "year", "recid", "diag1", "diag2"))
+  expect_named(
+    slf_data,
+    c("chi", "year", "recid", "diag1", "diag2"),
+    ignore.order = TRUE
+  )
   expect_gte(nrow(slf_data), 20000)
   expect_equal(unique(slf_data$recid), "04B")
   expect_equal(unique(slf_data$year), "1920")

From dc79a75d4f1809e66fc3d0208fa447724a6be3e5 Mon Sep 17 00:00:00 2001
From: Zihao Li <lizihao_anu@outlook.com>
Date: Wed, 21 Jun 2023 09:50:51 +0100
Subject: [PATCH 132/200] remove running in chunks

---
 R/aggregate_by_chi_zihao.R | 142 ++++++++++++++++---------------------
 R/create_individual_file.R |  12 ++--
 2 files changed, 69 insertions(+), 85 deletions(-)

diff --git a/R/aggregate_by_chi_zihao.R b/R/aggregate_by_chi_zihao.R
index 270f8c87c..884ca56a1 100644
--- a/R/aggregate_by_chi_zihao.R
+++ b/R/aggregate_by_chi_zihao.R
@@ -7,10 +7,10 @@
 #' @importFrom data.table .SD
 #'
 #' @inheritParams create_individual_file
-aggregate_by_chi_zihao <- function(episode_file) {
+aggregate_by_chi_zihao <- function(individual_file) {
   cli::cli_alert_info("Aggregate by CHI function started at {Sys.time()}")
 
-  episode_file <- episode_file %>%
+  individual_file <- individual_file %>%
     dplyr::select(-c(postcode, gpprac)) %>%
     dplyr::rename(
       "gpprac" = "most_recent_gpprac",
@@ -22,13 +22,13 @@ aggregate_by_chi_zihao <- function(episode_file) {
       dplyr::ends_with("_DoB")
     ))
 
-  names(episode_file) <- tolower(names(episode_file))
+  names(individual_file) <- tolower(names(individual_file))
 
-  data.table::setDT(episode_file) # Convert to data.table
+  data.table::setDT(individual_file) # Convert to data.table
 
   # Sort the data within each chunk
   data.table::setkeyv(
-    episode_file,
+    individual_file,
     c(
       "chi",
       "record_keydate1",
@@ -39,7 +39,7 @@ aggregate_by_chi_zihao <- function(episode_file) {
   )
 
   data.table::setnames(
-    episode_file,
+    individual_file,
     c(
       "ch_chi_cis", "cij_marker", "ooh_case_id"
       # ,"hh_in_fy"
@@ -50,19 +50,10 @@ aggregate_by_chi_zihao <- function(episode_file) {
     )
   )
 
-  # Initialize an empty data.table for the aggregated results
-  aggregated_data <- data.table::data.table()
-
-  # Process the data in chunks
-  chunk_size <- min(nrow(episode_file), 5e7)
-  # Adjust the chunk size as per your system's memory capacity
-  n_chunks <- nrow(episode_file) %/% chunk_size
-
-
   # colums specification
   # columns to select last
   cols2 <- vars_end_with(
-    episode_file,
+    individual_file,
     c("postcode", "dob", "ggprac")
   )
   # columns to select last unique rows
@@ -98,12 +89,12 @@ aggregate_by_chi_zihao <- function(episode_file) {
     "hb2019",
     "hscp2019",
     "ca2019",
-    vars_start_with(episode_file, "sc_")
+    vars_start_with(individual_file, "sc_")
   )
   # columns to sum up
   cols4 <- c(
     vars_end_with(
-      episode_file,
+      individual_file,
       c(
         "episodes",
         "beddays",
@@ -120,20 +111,22 @@ aggregate_by_chi_zihao <- function(episode_file) {
         "time",
         "assessment",
         "other",
-        # "dn",
+        "dn",
         "nhs24",
         "pcc",
         "_dnas"
       )
     ),
     vars_start_with(
-      episode_file,
+      individual_file,
       "sds_option"
-    )
+    ),
+    "health_net_costincdnas"
   )
   cols4 <- cols4[!(cols4 %in% c("ch_cis_episodes"))]
-  # # columns to select maximum
-  # cols5 <- vars_contain(episode_file, "nsu")
+  # columns to select maximum
+  cols5 <- vars_contain(individual_file, c("nsu", "hl1_in_fy"))
+  cols5 <- cols5[!(cols5 %in% c("ooh_consultation_time"))]
   # columns to select first row
   cols6 <- c(
     condition_cols(),
@@ -141,68 +134,59 @@ aggregate_by_chi_zihao <- function(episode_file) {
     # "deceased",
     "year",
     vars_end_with(
-      episode_file,
+      individual_file,
       c("_cohort", "end_fy", "start_fy")
     )
   )
 
-  for (i in 1:n_chunks) {
-    start <- (i - 1) * chunk_size + 1
-    end <- i * chunk_size
-    # Subset the data to the current chunk
-    chunk <- episode_file[start:end]
-
-    # compute
-    chunk_cols1 <- chunk[,
-      .(gender = mean(gender)),
-      by = chi
-    ]
-    chunk_cols2 <- chunk[,
-      .SD[.N],
-      .SDcols = cols2,
-      by = chi
-    ]
-    chunk_cols3 <- chunk[,
-      lapply(.SD, function(x) {
-        data.table::uniqueN(x, na.rm = TRUE)
-      }),
-      .SDcols = cols3,
-      by = chi
-    ]
-    chunk_cols4 <- chunk[,
-      lapply(.SD, function(x) {
-        sum(x, na.rm = TRUE)
-      }),
-      .SDcols = cols4,
-      by = chi
-    ]
-    # chunk_cols5 <- chunk[,
-    #                      lapply(.SD, function(x) max(x, na.rm = TRUE)),
-    #                      .SDcols = cols5,
-    #                      by = chi]
-    chunk_cols6 <- chunk[,
-      lapply(.SD, function(x) {
-        x[!is.na(x)][1]
-      }),
-      .SDcols = cols6,
-      by = chi
-    ]
-    chunk_agg <- dplyr::bind_cols(
-      chunk_cols1,
-      chunk_cols2[, chi := NULL],
-      chunk_cols3[, chi := NULL],
-      chunk_cols4[, chi := NULL],
-      # chunk_cols5[, chi := NULL],
-      chunk_cols6[, chi := NULL]
-    )
+  # compute
+  individual_file_cols1 <- individual_file[,
+    .(gender = mean(gender)),
+    by = chi
+  ]
+  individual_file_cols2 <- individual_file[,
+    .SD[.N],
+    .SDcols = cols2,
+    by = chi
+  ]
+  individual_file_cols3 <- individual_file[,
+    lapply(.SD, function(x) {
+      data.table::uniqueN(x, na.rm = TRUE)
+    }),
+    .SDcols = cols3,
+    by = chi
+  ]
+  individual_file_cols4 <- individual_file[,
+    lapply(.SD, function(x) {
+      sum(x, na.rm = TRUE)
+    }),
+    .SDcols = cols4,
+    by = chi
+  ]
+  individual_file_cols5 <- individual_file[,
+                       lapply(.SD, function(x) max(x, na.rm = TRUE)),
+                       .SDcols = cols5,
+                       by = chi]
+  individual_file_cols6 <- individual_file[,
+    lapply(.SD, function(x) {
+      x[!is.na(x)][1]
+    }),
+    .SDcols = cols6,
+    by = chi
+  ]
+  individual_file <- dplyr::bind_cols(
+    individual_file_cols1,
+    individual_file_cols2[, chi := NULL],
+    individual_file_cols3[, chi := NULL],
+    individual_file_cols4[, chi := NULL],
+    individual_file_cols5[, chi := NULL],
+    individual_file_cols6[, chi := NULL]
+  )
 
-    # Append the aggregated chunk to the overall result
-    aggregated_data <-
-      data.table::rbindlist(list(aggregated_data, chunk_agg))
-  }
-  aggregated_data <- dplyr::as_tibble(aggregated_data)
+  # convert back to tibble
+  individual_file <- dplyr::as_tibble(individual_file)
 
-  return(aggregated_data)
+  return(individual_file)
 }
 
 
diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index b7a7d27a6..76473807f 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -51,12 +51,12 @@ add_cij_columns <- function(episode_file) {
         .data$cij_marker,
         NA_real_
       ),
-      # assume cij_delay is logic variable
-      cij_delay = dplyr::if_else(
-        (.data$cij_delay & .data$cij_marker == 1),
-        1,
-        0
-      ),
+      # # assume cij_delay is logic variable
+      # cij_delay = dplyr::if_else(
+      #   (.data$cij_delay & .data$cij_marker == 1),
+      #   1,
+      #   0
+      # ),
       preventable_admissions = dplyr::if_else((.data$cij_ppa == 1 &
         .data$cij_marker == 1),
       1,

From 28289fafb4c07ab090d291cc5586b5339b138066 Mon Sep 17 00:00:00 2001
From: lizihao-anu <lizihao-anu@users.noreply.github.com>
Date: Wed, 21 Jun 2023 08:53:07 +0000
Subject: [PATCH 133/200] Style code

---
 R/aggregate_by_chi_zihao.R | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/R/aggregate_by_chi_zihao.R b/R/aggregate_by_chi_zihao.R
index 884ca56a1..4fa1fdfd2 100644
--- a/R/aggregate_by_chi_zihao.R
+++ b/R/aggregate_by_chi_zihao.R
@@ -164,9 +164,10 @@ aggregate_by_chi_zihao <- function(individual_file) {
     by = chi
   ]
   individual_file_cols5 <- individual_file[,
-                       lapply(.SD, function(x) max(x, na.rm = TRUE)),
-                       .SDcols = cols5,
-                       by = chi]
+    lapply(.SD, function(x) max(x, na.rm = TRUE)),
+    .SDcols = cols5,
+    by = chi
+  ]
   individual_file_cols6 <- individual_file[,
     lapply(.SD, function(x) {
       x[!is.na(x)][1]

From 5ad5c783afc04f2ce9bdaa90ba29ae081f4145b7 Mon Sep 17 00:00:00 2001
From: James McMahon <james.mcmahon@phs.scot>
Date: Wed, 21 Jun 2023 10:32:42 +0100
Subject: [PATCH 134/200] Update the targets config to use
 `timestamp_positives` as the default reporter

---
 _targets.yaml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/_targets.yaml b/_targets.yaml
index db0d98f0b..24c8a3733 100644
--- a/_targets.yaml
+++ b/_targets.yaml
@@ -1,3 +1,5 @@
 main:
   store: /conf/sourcedev/Source_Linkage_File_Updates/_targets
-  workers: '18'
+  workers: '16'
+  reporter_make: timestamp_positives
+  seconds_interval: 30

From f6e04ceaf8cd272a6c0e69efceaff8b9f66269fb Mon Sep 17 00:00:00 2001
From: Zihao Li <lizihao_anu@outlook.com>
Date: Wed, 21 Jun 2023 12:41:34 +0100
Subject: [PATCH 135/200] fix the bug of preventable_beddays

---
 R/create_individual_file.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index 76473807f..5b5fef0ef 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -65,8 +65,8 @@ add_cij_columns <- function(episode_file) {
       preventable_beddays = dplyr::if_else((.data$cij_ppa == 1 &
         .data$cij_marker == 1),
       as.numeric(
-        min(.data$cij_end_date, end_fy(year)) -
-          min(.data$cij_start_date, start_fy(year))
+        pmin(.data$cij_end_date, end_fy(year)) -
+          pmin(.data$cij_start_date, start_fy(year))
       ),
       0
       )

From 6bcf2b2fd1a52ba7190a1c2fcded14d178904e8f Mon Sep 17 00:00:00 2001
From: lizihao-anu <lizihao-anu@users.noreply.github.com>
Date: Wed, 21 Jun 2023 11:44:20 +0000
Subject: [PATCH 136/200] Update documentation

---
 man/aggregate_by_chi_zihao.Rd | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/man/aggregate_by_chi_zihao.Rd b/man/aggregate_by_chi_zihao.Rd
index 3d4961e19..a754fde4d 100644
--- a/man/aggregate_by_chi_zihao.Rd
+++ b/man/aggregate_by_chi_zihao.Rd
@@ -4,10 +4,7 @@
 \alias{aggregate_by_chi_zihao}
 \title{Aggregate by CHI}
 \usage{
-aggregate_by_chi_zihao(episode_file)
-}
-\arguments{
-\item{episode_file}{Tibble containing episodic data}
+aggregate_by_chi_zihao(individual_file)
 }
 \description{
 Aggregate episode file by CHI to convert into

From b0065c9b8cc6107d6dda81ce4408a0522bf6f1b6 Mon Sep 17 00:00:00 2001
From: Zihao Li <lizihao_anu@outlook.com>
Date: Wed, 21 Jun 2023 13:50:13 +0100
Subject: [PATCH 137/200] fix total ae_attendances

---
 R/create_individual_file.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index 5b5fef0ef..99a1a6e77 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -195,7 +195,7 @@ add_ae_columns <- function(episode_file, prefix, condition) {
   condition <- substitute(condition)
   episode_file %>%
     add_standard_cols(prefix, condition, cost = TRUE) %>%
-    dplyr::mutate("{prefix}_attendances" := dplyr::if_else(eval(condition), .data$cost_total_net, NA_real_))
+    dplyr::mutate("{prefix}_attendances" := dplyr::if_else(eval(condition), 1, NA_real_))
 }
 
 #' Add PIS columns

From 42f107a95a756f98eca162a2f56bfef2a6dbec51 Mon Sep 17 00:00:00 2001
From: Zihao Li <lizihao_anu@outlook.com>
Date: Wed, 21 Jun 2023 14:16:54 +0100
Subject: [PATCH 138/200] fix the bug of preventable_admissions

---
 R/aggregate_by_chi_zihao.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/R/aggregate_by_chi_zihao.R b/R/aggregate_by_chi_zihao.R
index 4fa1fdfd2..818c17af0 100644
--- a/R/aggregate_by_chi_zihao.R
+++ b/R/aggregate_by_chi_zihao.R
@@ -65,7 +65,6 @@ aggregate_by_chi_zihao <- function(individual_file) {
     "cij_mat",
     # "cij_delay",
     "ooh_cases",
-    "preventable_admissions",
     "gpprac",
     "hbrescode",
     "hscp",
@@ -121,7 +120,8 @@ aggregate_by_chi_zihao <- function(individual_file) {
       individual_file,
       "sds_option"
     ),
-    "health_net_costincdnas"
+    "health_net_costincdnas",
+    "preventable_admissions"
   )
   cols4 <- cols4[!(cols4 %in% c("ch_cis_episodes"))]
   # columns to select maximum

From 9612b9af3c89188dd3decd7c671e257c25bb4a4c Mon Sep 17 00:00:00 2001
From: Zihao Li <lizihao_anu@outlook.com>
Date: Wed, 21 Jun 2023 15:37:32 +0100
Subject: [PATCH 139/200] fix the bug of hbrescode etc

---
 R/aggregate_by_chi_zihao.R | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/R/aggregate_by_chi_zihao.R b/R/aggregate_by_chi_zihao.R
index 818c17af0..a6b95453f 100644
--- a/R/aggregate_by_chi_zihao.R
+++ b/R/aggregate_by_chi_zihao.R
@@ -54,16 +54,7 @@ aggregate_by_chi_zihao <- function(individual_file) {
   # columns to select last
   cols2 <- vars_end_with(
     individual_file,
-    c("postcode", "dob", "ggprac")
-  )
-  # columns to select last unique rows
-  cols3 <- c(
-    "ch_cis_episodes",
-    "cij_total",
-    "cij_el",
-    "cij_non_el",
-    "cij_mat",
-    # "cij_delay",
+    c("postcode", "dob", "ggprac"),
     "ooh_cases",
     "gpprac",
     "hbrescode",
@@ -90,6 +81,15 @@ aggregate_by_chi_zihao <- function(individual_file) {
     "ca2019",
     vars_start_with(individual_file, "sc_")
   )
+  # columns to select last unique rows
+  cols3 <- c(
+    "ch_cis_episodes",
+    "cij_total",
+    "cij_el",
+    "cij_non_el",
+    "cij_mat",
+    # "cij_delay"
+  )
   # columns to sum up
   cols4 <- c(
     vars_end_with(

From 4750913e54b505775d7b0526367fc0d330f659ac Mon Sep 17 00:00:00 2001
From: Zihao Li <lizihao_anu@outlook.com>
Date: Wed, 21 Jun 2023 17:10:00 +0100
Subject: [PATCH 140/200] minor fix

---
 R/aggregate_by_chi_zihao.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/aggregate_by_chi_zihao.R b/R/aggregate_by_chi_zihao.R
index a6b95453f..7162b55db 100644
--- a/R/aggregate_by_chi_zihao.R
+++ b/R/aggregate_by_chi_zihao.R
@@ -87,7 +87,7 @@ aggregate_by_chi_zihao <- function(individual_file) {
     "cij_total",
     "cij_el",
     "cij_non_el",
-    "cij_mat",
+    "cij_mat"
     # "cij_delay"
   )
   # columns to sum up

From 338479f1022b0c0857e4580fda48aa7e53851246 Mon Sep 17 00:00:00 2001
From: Zihao Li <lizihao_anu@outlook.com>
Date: Wed, 21 Jun 2023 17:32:46 +0100
Subject: [PATCH 141/200] minor fix

---
 R/aggregate_by_chi_zihao.R | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/R/aggregate_by_chi_zihao.R b/R/aggregate_by_chi_zihao.R
index 7162b55db..3171a59dd 100644
--- a/R/aggregate_by_chi_zihao.R
+++ b/R/aggregate_by_chi_zihao.R
@@ -52,9 +52,9 @@ aggregate_by_chi_zihao <- function(individual_file) {
 
   # colums specification
   # columns to select last
-  cols2 <- vars_end_with(
-    individual_file,
-    c("postcode", "dob", "ggprac"),
+  cols2 <- c(
+    vars_end_with(individual_file,
+                  c("postcode", "dob", "ggprac")),
     "ooh_cases",
     "gpprac",
     "hbrescode",

From 724f31902f15a0ddb7ad47b9bd0a1e3e22ba18a4 Mon Sep 17 00:00:00 2001
From: lizihao-anu <lizihao-anu@users.noreply.github.com>
Date: Wed, 21 Jun 2023 16:35:00 +0000
Subject: [PATCH 142/200] Style code

---
 R/aggregate_by_chi_zihao.R | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/R/aggregate_by_chi_zihao.R b/R/aggregate_by_chi_zihao.R
index 3171a59dd..9a7769d6a 100644
--- a/R/aggregate_by_chi_zihao.R
+++ b/R/aggregate_by_chi_zihao.R
@@ -53,8 +53,10 @@ aggregate_by_chi_zihao <- function(individual_file) {
   # colums specification
   # columns to select last
   cols2 <- c(
-    vars_end_with(individual_file,
-                  c("postcode", "dob", "ggprac")),
+    vars_end_with(
+      individual_file,
+      c("postcode", "dob", "ggprac")
+    ),
     "ooh_cases",
     "gpprac",
     "hbrescode",

From e9c8ef020aec636386ad2c16c54fcfe9a104f0f2 Mon Sep 17 00:00:00 2001
From: Zihao Li <lizihao_anu@outlook.com>
Date: Tue, 27 Jun 2023 13:08:01 +0100
Subject: [PATCH 143/200] fix home care cost

---
 R/process_extract_home_care.R | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/R/process_extract_home_care.R b/R/process_extract_home_care.R
index a556dd8e6..d541cc86a 100644
--- a/R/process_extract_home_care.R
+++ b/R/process_extract_home_care.R
@@ -66,7 +66,10 @@ process_extract_home_care <- function(
     # remove cost variables not from current year
     dplyr::select(-(tidyselect::contains("hc_cost_2"))) %>%
     # create cost total net
-    dplyr::mutate(cost_total_net = rowSums(dplyr::pick(tidyselect::contains("hc_cost_q"))))
+    dplyr::mutate(cost_total_net = rowSums(
+      dplyr::pick(tidyselect::contains("hc_cost_q"))),
+      na.rm = TRUE
+    )
 
 
   # Outfile ---------------------------------------

From 9a951a53d4c415dafc246c5fe18a0013a1fdd97f Mon Sep 17 00:00:00 2001
From: Zihao Li <lizihao_anu@outlook.com>
Date: Tue, 27 Jun 2023 13:09:05 +0100
Subject: [PATCH 144/200] add ipdc to fix maternity

---
 R/process_extract_maternity.R | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/R/process_extract_maternity.R b/R/process_extract_maternity.R
index 188d55ab5..04fa46ced 100644
--- a/R/process_extract_maternity.R
+++ b/R/process_extract_maternity.R
@@ -55,7 +55,12 @@ process_extract_maternity <- function(data, year, write_to_disk = TRUE) {
       discondition = factor(.data$discondition,
         levels = c(1L:5L, 8L)
       ),
-      smrtype = add_smr_type(.data$recid, .data$mpat)
+      smrtype = add_smr_type(.data$recid, .data$mpat),
+      ipdc = dplyr::case_match(
+        .data$smrtype,
+        "Matern-IP" ~ "I",
+        "Matern-DC" ~ "D"
+      )
     )
 
 
@@ -102,7 +107,8 @@ process_extract_maternity <- function(data, year, write_to_disk = TRUE) {
       "cost_total_net",
       tidyselect::ends_with("_beddays"),
       tidyselect::ends_with("_cost"),
-      "uri"
+      "uri",
+      "ipdc"
     ) %>%
     dplyr::arrange(.data$chi, .data$record_keydate1)
 

From 4b63ee87af6e89f9b35922befcc57defb222a859 Mon Sep 17 00:00:00 2001
From: Zihao Li <lizihao_anu@outlook.com>
Date: Tue, 27 Jun 2023 14:05:59 +0100
Subject: [PATCH 145/200] fix preventable admission and care home cost

---
 R/create_individual_file.R | 52 ++++++++++++++++++++------------------
 1 file changed, 27 insertions(+), 25 deletions(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index 99a1a6e77..b94e3e027 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -40,17 +40,14 @@ add_cij_columns <- function(episode_file) {
   episode_file %>%
     dplyr::mutate(
       CIJ_non_el = dplyr::if_else(.data$cij_pattype_code == 0,
-        .data$cij_marker,
-        NA_real_
-      ),
+                                  .data$cij_marker,
+                                  NA_real_),
       CIJ_el = dplyr::if_else(.data$cij_pattype_code == 1,
-        .data$cij_marker,
-        NA_real_
-      ),
+                              .data$cij_marker,
+                              NA_real_),
       CIJ_mat = dplyr::if_else(.data$cij_pattype_code == 2,
-        .data$cij_marker,
-        NA_real_
-      ),
+                               .data$cij_marker,
+                               NA_real_),
       # # assume cij_delay is logic variable
       # cij_delay = dplyr::if_else(
       #   (.data$cij_delay & .data$cij_marker == 1),
@@ -58,18 +55,10 @@ add_cij_columns <- function(episode_file) {
       #   0
       # ),
       preventable_admissions = dplyr::if_else((.data$cij_ppa == 1 &
-        .data$cij_marker == 1),
-      1,
-      0
-      ),
-      preventable_beddays = dplyr::if_else((.data$cij_ppa == 1 &
-        .data$cij_marker == 1),
-      as.numeric(
-        pmin(.data$cij_end_date, end_fy(year)) -
-          pmin(.data$cij_start_date, start_fy(year))
-      ),
-      0
-      )
+                                                 .data$cij_marker == 1),
+                                              1,
+                                              0)
+      # preventable_beddays is now added in aggragate_by_chi
     )
 }
 
@@ -313,12 +302,25 @@ add_ch_columns <- function(episode_file, prefix, condition) {
   episode_file %>%
     add_standard_cols(prefix, condition) %>%
     dplyr::mutate(
-      ch_cost_per_day = dplyr::if_else(eval(condition) & .data$yearstay > 0, .data$cost_total_net / .data$yearstay, NA_real_),
-      ch_cost_per_day = dplyr::if_else(eval(condition) & .data$yearstay == 0, .data$cost_total_net / .data$yearstay, .data$ch_cost_per_day),
+      ch_cost_per_day = dplyr::if_else(
+        eval(condition) &
+          .data$yearstay > 0,
+        .data$cost_total_net / .data$yearstay,
+        .data$cost_total_net
+      ),
       ch_no_cost = eval(condition) & is.na(.data$ch_cost_per_day),
-      ch_ep_end = dplyr::if_else(eval(condition), .data$record_keydate2, lubridate::NA_Date_),
+      ch_ep_end = dplyr::if_else(
+        eval(condition),
+        .data$record_keydate2,
+        lubridate::NA_Date_
+      ),
       # If end date is missing use the first day of next FY quarter
-      ch_ep_end = dplyr::if_else(eval(condition) & is.na(.data$ch_ep_end), start_next_fy_quarter(.data$sc_latest_submission), .data$ch_ep_end)
+      ch_ep_end = dplyr::if_else(
+        eval(condition) &
+          is.na(.data$ch_ep_end),
+        start_next_fy_quarter(.data$sc_latest_submission),
+        .data$ch_ep_end
+      )
     )
 }
 

From c42d7ba9d6fb8f2f8d66b092d347c24d7bf49da3 Mon Sep 17 00:00:00 2001
From: Zihao Li <lizihao_anu@outlook.com>
Date: Tue, 27 Jun 2023 14:07:54 +0100
Subject: [PATCH 146/200] fix preventable_admissions and calculate
 preventable_beddays here

---
 R/aggregate_by_chi_zihao.R | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/R/aggregate_by_chi_zihao.R b/R/aggregate_by_chi_zihao.R
index 3171a59dd..454ae920e 100644
--- a/R/aggregate_by_chi_zihao.R
+++ b/R/aggregate_by_chi_zihao.R
@@ -87,9 +87,12 @@ aggregate_by_chi_zihao <- function(individual_file) {
     "cij_total",
     "cij_el",
     "cij_non_el",
-    "cij_mat"
-    # "cij_delay"
+    "cij_mat",
+    # "cij_delay",
+    "preventable_admissions"
   )
+  # columns to select last unique rows group by chi and cij_marker
+  cols3.1 <- c("preventable_beddays")
   # columns to sum up
   cols4 <- c(
     vars_end_with(
@@ -120,8 +123,7 @@ aggregate_by_chi_zihao <- function(individual_file) {
       individual_file,
       "sds_option"
     ),
-    "health_net_costincdnas",
-    "preventable_admissions"
+    "health_net_costincdnas"
   )
   cols4 <- cols4[!(cols4 %in% c("ch_cis_episodes"))]
   # columns to select maximum
@@ -156,6 +158,14 @@ aggregate_by_chi_zihao <- function(individual_file) {
     .SDcols = cols3,
     by = chi
   ]
+  individual_file_cols3.1 <- individual_file[,
+    preventable_beddays :=
+      data.table::fifelse(cij_ppa == 1,
+                          max(cij_end_date) - min(cij_start_date),
+                          NA_real_),
+    .SDcols = cols3.1,
+    by = c("chi", "cij_marker")
+  ]
   individual_file_cols4 <- individual_file[,
     lapply(.SD, function(x) {
       sum(x, na.rm = TRUE)
@@ -183,6 +193,9 @@ aggregate_by_chi_zihao <- function(individual_file) {
     individual_file_cols5[, chi := NULL],
     individual_file_cols6[, chi := NULL]
   )
+  # cannot simply combine individual_file_cols3.1 as different group_by factors.
+  individual_file <- individual_file[individual_file_cols3.1,
+                                     on = "chi"]
 
   # convert back to tibble
   individual_file <- dplyr::as_tibble(individual_file)

From f0671fc3d76bcbf94a043870efe328d7d3fe7533 Mon Sep 17 00:00:00 2001
From: Zihao Li <lizihao_anu@outlook.com>
Date: Tue, 27 Jun 2023 14:43:42 +0100
Subject: [PATCH 147/200] add monthly_beddays and yearstay to dd

---
 R/link_delayed_discharge_eps.R | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/R/link_delayed_discharge_eps.R b/R/link_delayed_discharge_eps.R
index ee99503dd..67a6940c7 100644
--- a/R/link_delayed_discharge_eps.R
+++ b/R/link_delayed_discharge_eps.R
@@ -280,6 +280,12 @@ link_delayed_discharge_eps <- function(data, year) {
     dplyr::group_by(chi, cij_marker) %>%
     dplyr::mutate(cij_delay = max(has_delay)) %>%
     dplyr::ungroup() %>%
+    # add yearstay and monthly beddays
+    create_monthly_beddays() %>%
+    dplyr::mutate(yearstay = dplyr::rowSums(
+      paste0(month.abb[c(4:12,1:3)] %>% tolower(), "_beddays")
+    ))
+
     # tidy up and rename columns to match the format of episode files
     dplyr::select(
       "year" = "year_dd",
@@ -310,7 +316,9 @@ link_delayed_discharge_eps <- function(data, year) {
       "cij_delay",
       "location",
       "spec" = "spec_dd",
-      "dd_type"
+      "dd_type",
+      paste0(month.abb[c(4:12,1:3)] %>% tolower(), "_beddays"),
+      "yearstay"
     ) %>%
     # combine DD with episode data
     dplyr::bind_rows( # restore cij_end_date

From 9cc84f3832e6f7c46a8b72fbc8cb2a84d653c842 Mon Sep 17 00:00:00 2001
From: lizihao-anu <lizihao-anu@users.noreply.github.com>
Date: Tue, 27 Jun 2023 13:48:55 +0000
Subject: [PATCH 148/200] Style code

---
 R/aggregate_by_chi_zihao.R     | 11 ++++--
 R/create_individual_file.R     | 22 ++++++-----
 R/link_delayed_discharge_eps.R | 70 +++++++++++++++++-----------------
 R/process_extract_home_care.R  |  6 ++-
 4 files changed, 59 insertions(+), 50 deletions(-)

diff --git a/R/aggregate_by_chi_zihao.R b/R/aggregate_by_chi_zihao.R
index eea4ea898..a2e301a90 100644
--- a/R/aggregate_by_chi_zihao.R
+++ b/R/aggregate_by_chi_zihao.R
@@ -162,9 +162,11 @@ aggregate_by_chi_zihao <- function(individual_file) {
   ]
   individual_file_cols3.1 <- individual_file[,
     preventable_beddays :=
-      data.table::fifelse(cij_ppa == 1,
-                          max(cij_end_date) - min(cij_start_date),
-                          NA_real_),
+      data.table::fifelse(
+        cij_ppa == 1,
+        max(cij_end_date) - min(cij_start_date),
+        NA_real_
+      ),
     .SDcols = cols3.1,
     by = c("chi", "cij_marker")
   ]
@@ -197,7 +199,8 @@ aggregate_by_chi_zihao <- function(individual_file) {
   )
   # cannot simply combine individual_file_cols3.1 as different group_by factors.
   individual_file <- individual_file[individual_file_cols3.1,
-                                     on = "chi"]
+    on = "chi"
+  ]
 
   # convert back to tibble
   individual_file <- dplyr::as_tibble(individual_file)
diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index b94e3e027..5444a850e 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -40,14 +40,17 @@ add_cij_columns <- function(episode_file) {
   episode_file %>%
     dplyr::mutate(
       CIJ_non_el = dplyr::if_else(.data$cij_pattype_code == 0,
-                                  .data$cij_marker,
-                                  NA_real_),
+        .data$cij_marker,
+        NA_real_
+      ),
       CIJ_el = dplyr::if_else(.data$cij_pattype_code == 1,
-                              .data$cij_marker,
-                              NA_real_),
+        .data$cij_marker,
+        NA_real_
+      ),
       CIJ_mat = dplyr::if_else(.data$cij_pattype_code == 2,
-                               .data$cij_marker,
-                               NA_real_),
+        .data$cij_marker,
+        NA_real_
+      ),
       # # assume cij_delay is logic variable
       # cij_delay = dplyr::if_else(
       #   (.data$cij_delay & .data$cij_marker == 1),
@@ -55,9 +58,10 @@ add_cij_columns <- function(episode_file) {
       #   0
       # ),
       preventable_admissions = dplyr::if_else((.data$cij_ppa == 1 &
-                                                 .data$cij_marker == 1),
-                                              1,
-                                              0)
+        .data$cij_marker == 1),
+      1,
+      0
+      )
       # preventable_beddays is now added in aggragate_by_chi
     )
 }
diff --git a/R/link_delayed_discharge_eps.R b/R/link_delayed_discharge_eps.R
index 67a6940c7..479545ed6 100644
--- a/R/link_delayed_discharge_eps.R
+++ b/R/link_delayed_discharge_eps.R
@@ -283,43 +283,43 @@ link_delayed_discharge_eps <- function(data, year) {
     # add yearstay and monthly beddays
     create_monthly_beddays() %>%
     dplyr::mutate(yearstay = dplyr::rowSums(
-      paste0(month.abb[c(4:12,1:3)] %>% tolower(), "_beddays")
+      paste0(month.abb[c(4:12, 1:3)] %>% tolower(), "_beddays")
     ))
 
-    # tidy up and rename columns to match the format of episode files
-    dplyr::select(
-      "year" = "year_dd",
-      "recid" = "recid_dd",
-      "record_keydate1" = "record_keydate1_dd",
-      "record_keydate2" = "record_keydate2_dd",
-      "smrtype",
-      "chi",
-      "gender",
-      "dob",
-      "age",
-      "gpprac",
-      "postcode" = "postcode_dd",
-      "lca" = "dd_responsible_lca",
-      "hbtreatcode" = "hbtreatcode_dd",
-      "original_admission_date",
-      "delay_end_reason",
-      "primary_delay_reason",
-      "secondary_delay_reason",
-      "cij_marker",
-      "cij_start_date",
-      "cij_end_date",
-      "cij_pattype_code",
-      "cij_ipdc",
-      "cij_admtype",
-      "cij_adm_spec",
-      "cij_dis_spec",
-      "cij_delay",
-      "location",
-      "spec" = "spec_dd",
-      "dd_type",
-      paste0(month.abb[c(4:12,1:3)] %>% tolower(), "_beddays"),
-      "yearstay"
-    ) %>%
+  # tidy up and rename columns to match the format of episode files
+  dplyr::select(
+    "year" = "year_dd",
+    "recid" = "recid_dd",
+    "record_keydate1" = "record_keydate1_dd",
+    "record_keydate2" = "record_keydate2_dd",
+    "smrtype",
+    "chi",
+    "gender",
+    "dob",
+    "age",
+    "gpprac",
+    "postcode" = "postcode_dd",
+    "lca" = "dd_responsible_lca",
+    "hbtreatcode" = "hbtreatcode_dd",
+    "original_admission_date",
+    "delay_end_reason",
+    "primary_delay_reason",
+    "secondary_delay_reason",
+    "cij_marker",
+    "cij_start_date",
+    "cij_end_date",
+    "cij_pattype_code",
+    "cij_ipdc",
+    "cij_admtype",
+    "cij_adm_spec",
+    "cij_dis_spec",
+    "cij_delay",
+    "location",
+    "spec" = "spec_dd",
+    "dd_type",
+    paste0(month.abb[c(4:12, 1:3)] %>% tolower(), "_beddays"),
+    "yearstay"
+  ) %>%
     # combine DD with episode data
     dplyr::bind_rows( # restore cij_end_date
       data %>%
diff --git a/R/process_extract_home_care.R b/R/process_extract_home_care.R
index d541cc86a..b510f5c3b 100644
--- a/R/process_extract_home_care.R
+++ b/R/process_extract_home_care.R
@@ -66,8 +66,10 @@ process_extract_home_care <- function(
     # remove cost variables not from current year
     dplyr::select(-(tidyselect::contains("hc_cost_2"))) %>%
     # create cost total net
-    dplyr::mutate(cost_total_net = rowSums(
-      dplyr::pick(tidyselect::contains("hc_cost_q"))),
+    dplyr::mutate(
+      cost_total_net = rowSums(
+        dplyr::pick(tidyselect::contains("hc_cost_q"))
+      ),
       na.rm = TRUE
     )
 

From d6391e54cd4e99310e2f4b8e24bcdee9d8fd682b Mon Sep 17 00:00:00 2001
From: Zihao Li <lizihao_anu@outlook.com>
Date: Tue, 27 Jun 2023 22:03:54 +0100
Subject: [PATCH 149/200] fix preventable_admissions and preventable_beddays

---
 R/aggregate_by_chi_zihao.R | 52 +++++++++++++++++++++-----------------
 R/create_individual_file.R |  8 +++---
 2 files changed, 32 insertions(+), 28 deletions(-)

diff --git a/R/aggregate_by_chi_zihao.R b/R/aggregate_by_chi_zihao.R
index a2e301a90..af65695e1 100644
--- a/R/aggregate_by_chi_zihao.R
+++ b/R/aggregate_by_chi_zihao.R
@@ -50,7 +50,7 @@ aggregate_by_chi_zihao <- function(individual_file) {
     )
   )
 
-  # colums specification
+  # colums specification, grouped by chi
   # columns to select last
   cols2 <- c(
     vars_end_with(
@@ -83,18 +83,15 @@ aggregate_by_chi_zihao <- function(individual_file) {
     "ca2019",
     vars_start_with(individual_file, "sc_")
   )
-  # columns to select last unique rows
+  # columns to count unique rows
   cols3 <- c(
     "ch_cis_episodes",
     "cij_total",
     "cij_el",
     "cij_non_el",
-    "cij_mat",
-    # "cij_delay",
-    "preventable_admissions"
+    "cij_mat"
+    # "cij_delay"
   )
-  # columns to select last unique rows group by chi and cij_marker
-  cols3.1 <- c("preventable_beddays")
   # columns to sum up
   cols4 <- c(
     vars_end_with(
@@ -142,6 +139,8 @@ aggregate_by_chi_zihao <- function(individual_file) {
       c("_cohort", "end_fy", "start_fy")
     )
   )
+  # columns to group by chi and cij_marker, mainly preventable
+  # cols7 <- c("preventable_admissions", "preventable_beddays")
 
   # compute
   individual_file_cols1 <- individual_file[,
@@ -160,16 +159,6 @@ aggregate_by_chi_zihao <- function(individual_file) {
     .SDcols = cols3,
     by = chi
   ]
-  individual_file_cols3.1 <- individual_file[,
-    preventable_beddays :=
-      data.table::fifelse(
-        cij_ppa == 1,
-        max(cij_end_date) - min(cij_start_date),
-        NA_real_
-      ),
-    .SDcols = cols3.1,
-    by = c("chi", "cij_marker")
-  ]
   individual_file_cols4 <- individual_file[,
     lapply(.SD, function(x) {
       sum(x, na.rm = TRUE)
@@ -189,19 +178,36 @@ aggregate_by_chi_zihao <- function(individual_file) {
     .SDcols = cols6,
     by = chi
   ]
+  individual_file_cols7 <- individual_file[,
+     `:=`(
+       preventable_beddays =
+         data.table::fifelse(
+           cij_ppa == 1,
+           max(cij_end_date) - min(cij_start_date),
+           NA_integer_
+         )
+     ),
+    by = c("chi", "cij_marker")
+  ]
+  individual_file_cols7 <- individual_file_cols7[,
+     `:=`(
+       preventable_admissions =
+         (unique(preventable_admissions) %>% uniqueN(na.rm = TRUE)),
+       preventable_beddays =
+         sum(preventable_beddays, na.rm = TRUE)
+     ),
+     by = "chi"
+  ]
+
   individual_file <- dplyr::bind_cols(
     individual_file_cols1,
     individual_file_cols2[, chi := NULL],
     individual_file_cols3[, chi := NULL],
     individual_file_cols4[, chi := NULL],
     individual_file_cols5[, chi := NULL],
-    individual_file_cols6[, chi := NULL]
+    individual_file_cols6[, chi := NULL],
+    individual_file_cols7[, chi := NULL],
   )
-  # cannot simply combine individual_file_cols3.1 as different group_by factors.
-  individual_file <- individual_file[individual_file_cols3.1,
-    on = "chi"
-  ]
-
   # convert back to tibble
   individual_file <- dplyr::as_tibble(individual_file)
 
diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index 5444a850e..d677609df 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -57,11 +57,9 @@ add_cij_columns <- function(episode_file) {
       #   1,
       #   0
       # ),
-      preventable_admissions = dplyr::if_else((.data$cij_ppa == 1 &
-        .data$cij_marker == 1),
-      1,
-      0
-      )
+      preventable_admissions = dplyr::if_else((.data$cij_ppa == 1),
+                                              cij_marker,
+                                              NA_integer_)
       # preventable_beddays is now added in aggragate_by_chi
     )
 }

From 1a136fdae625a5d8c782cec84ddc93af3aaa3545 Mon Sep 17 00:00:00 2001
From: lizihao-anu <lizihao-anu@users.noreply.github.com>
Date: Tue, 27 Jun 2023 21:06:06 +0000
Subject: [PATCH 150/200] Style code

---
 R/aggregate_by_chi_zihao.R | 30 +++++++++++++++---------------
 R/create_individual_file.R |  5 +++--
 2 files changed, 18 insertions(+), 17 deletions(-)

diff --git a/R/aggregate_by_chi_zihao.R b/R/aggregate_by_chi_zihao.R
index af65695e1..5c3665e2a 100644
--- a/R/aggregate_by_chi_zihao.R
+++ b/R/aggregate_by_chi_zihao.R
@@ -179,24 +179,24 @@ aggregate_by_chi_zihao <- function(individual_file) {
     by = chi
   ]
   individual_file_cols7 <- individual_file[,
-     `:=`(
-       preventable_beddays =
-         data.table::fifelse(
-           cij_ppa == 1,
-           max(cij_end_date) - min(cij_start_date),
-           NA_integer_
-         )
-     ),
+    `:=`(
+      preventable_beddays =
+        data.table::fifelse(
+          cij_ppa == 1,
+          max(cij_end_date) - min(cij_start_date),
+          NA_integer_
+        )
+    ),
     by = c("chi", "cij_marker")
   ]
   individual_file_cols7 <- individual_file_cols7[,
-     `:=`(
-       preventable_admissions =
-         (unique(preventable_admissions) %>% uniqueN(na.rm = TRUE)),
-       preventable_beddays =
-         sum(preventable_beddays, na.rm = TRUE)
-     ),
-     by = "chi"
+    `:=`(
+      preventable_admissions =
+        (unique(preventable_admissions) %>% uniqueN(na.rm = TRUE)),
+      preventable_beddays =
+        sum(preventable_beddays, na.rm = TRUE)
+    ),
+    by = "chi"
   ]
 
   individual_file <- dplyr::bind_cols(
diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index d677609df..91111b336 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -58,8 +58,9 @@ add_cij_columns <- function(episode_file) {
       #   0
       # ),
       preventable_admissions = dplyr::if_else((.data$cij_ppa == 1),
-                                              cij_marker,
-                                              NA_integer_)
+        cij_marker,
+        NA_integer_
+      )
       # preventable_beddays is now added in aggragate_by_chi
     )
 }

From c631f4e6d1de0cff9baca0d002fce0657483c78f Mon Sep 17 00:00:00 2001
From: Jennifer Thom <jennifer.thom@phs.scot>
Date: Wed, 28 Jun 2023 11:54:26 +0100
Subject: [PATCH 151/200] include parameter for write to disk/year

---
 NAMESPACE                     |  1 +
 R/create_individual_file.R    | 25 +++++++++++++++++++++++--
 man/create_individual_file.Rd | 10 +++++++++-
 3 files changed, 33 insertions(+), 3 deletions(-)

diff --git a/NAMESPACE b/NAMESPACE
index c3d083704..68529122d 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -13,6 +13,7 @@ export(convert_hscp_to_hscpnames)
 export(convert_numeric_to_date)
 export(convert_sending_location_to_lca)
 export(convert_year_to_fyyear)
+export(create_individual_file)
 export(create_service_use_cohorts)
 export(end_fy)
 export(end_fy_quarter)
diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index 99a1a6e77..50e881ace 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -3,8 +3,15 @@
 #' @description Creates individual file from episode file
 #'
 #' @param episode_file Tibble containing episodic data
-create_individual_file <- function(episode_file) {
-  episode_file %>%
+#' @param year The year to process, in FY format.
+#' @param write_to_disk (optional) Should the data be written to disk default is
+#' `TRUE` i.e. write the data to disk.
+#'
+#' @return The processed individual file
+#' @export
+#'
+create_individual_file <- function(episode_file, year, write_to_disk = TRUE) {
+  individual_file <- episode_file %>%
     remove_blank_chi() %>%
     add_cij_columns() %>%
     add_all_columns() %>%
@@ -13,6 +20,20 @@ create_individual_file <- function(episode_file) {
     recode_gender() %>%
     aggregate_by_chi_zihao() %>%
     clean_individual_file()
+
+  if (write_to_disk) {
+    slf_path <- get_file_path(
+      get_year_dir(year),
+      stringr::str_glue(
+        "source-individual-file-{year}.parquet"
+      ),
+      check_mode = "write"
+    )
+
+    write_file(episode_file, slf_path)
+  }
+
+  return(individual_file)
 }
 
 #' Remove blank CHI
diff --git a/man/create_individual_file.Rd b/man/create_individual_file.Rd
index 8b0887565..d1feb23df 100644
--- a/man/create_individual_file.Rd
+++ b/man/create_individual_file.Rd
@@ -4,10 +4,18 @@
 \alias{create_individual_file}
 \title{Create individual file}
 \usage{
-create_individual_file(episode_file)
+create_individual_file(episode_file, year, write_to_disk = TRUE)
 }
 \arguments{
 \item{episode_file}{Tibble containing episodic data}
+
+\item{year}{The year to process, in FY format.}
+
+\item{write_to_disk}{(optional) Should the data be written to disk default is
+\code{TRUE} i.e. write the data to disk.}
+}
+\value{
+The processed individual file
 }
 \description{
 Creates individual file from episode file

From b7316768a72b5cd7a91cb3e5b1881d40a84d93f4 Mon Sep 17 00:00:00 2001
From: Jennifer Thom <jennifer.thom@phs.scot>
Date: Wed, 28 Jun 2023 13:45:26 +0100
Subject: [PATCH 152/200] Add lookups to indiv file creation pipeline

---
 R/create_individual_file.R | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index 50e881ace..075ae5de5 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -19,7 +19,11 @@ create_individual_file <- function(episode_file, year, write_to_disk = TRUE) {
     clean_up_ch() %>%
     recode_gender() %>%
     aggregate_by_chi_zihao() %>%
-    clean_individual_file()
+    clean_individual_file() %>%
+    join_cohort_lookups(year) %>%
+    match_on_ltcs(year) %>%
+    join_deaths_data(year) %>%
+    join_sparra_hhg(year)
 
   if (write_to_disk) {
     slf_path <- get_file_path(

From 5507a332cc0868c81eba806be490ad99fdf1106e Mon Sep 17 00:00:00 2001
From: Jennifer Thom <jennifer.thom@phs.scot>
Date: Wed, 28 Jun 2023 11:54:26 +0100
Subject: [PATCH 153/200] include parameter for write to disk/year

---
 NAMESPACE                     |  1 +
 R/create_individual_file.R    | 25 +++++++++++++++++++++++--
 man/create_individual_file.Rd | 10 +++++++++-
 3 files changed, 33 insertions(+), 3 deletions(-)

diff --git a/NAMESPACE b/NAMESPACE
index aefb84b62..d81c66bfe 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -13,6 +13,7 @@ export(convert_hscp_to_hscpnames)
 export(convert_numeric_to_date)
 export(convert_sending_location_to_lca)
 export(convert_year_to_fyyear)
+export(create_individual_file)
 export(create_service_use_cohorts)
 export(end_fy)
 export(end_fy_quarter)
diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index 99a1a6e77..50e881ace 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -3,8 +3,15 @@
 #' @description Creates individual file from episode file
 #'
 #' @param episode_file Tibble containing episodic data
-create_individual_file <- function(episode_file) {
-  episode_file %>%
+#' @param year The year to process, in FY format.
+#' @param write_to_disk (optional) Should the data be written to disk default is
+#' `TRUE` i.e. write the data to disk.
+#'
+#' @return The processed individual file
+#' @export
+#'
+create_individual_file <- function(episode_file, year, write_to_disk = TRUE) {
+  individual_file <- episode_file %>%
     remove_blank_chi() %>%
     add_cij_columns() %>%
     add_all_columns() %>%
@@ -13,6 +20,20 @@ create_individual_file <- function(episode_file) {
     recode_gender() %>%
     aggregate_by_chi_zihao() %>%
     clean_individual_file()
+
+  if (write_to_disk) {
+    slf_path <- get_file_path(
+      get_year_dir(year),
+      stringr::str_glue(
+        "source-individual-file-{year}.parquet"
+      ),
+      check_mode = "write"
+    )
+
+    write_file(episode_file, slf_path)
+  }
+
+  return(individual_file)
 }
 
 #' Remove blank CHI
diff --git a/man/create_individual_file.Rd b/man/create_individual_file.Rd
index 8b0887565..d1feb23df 100644
--- a/man/create_individual_file.Rd
+++ b/man/create_individual_file.Rd
@@ -4,10 +4,18 @@
 \alias{create_individual_file}
 \title{Create individual file}
 \usage{
-create_individual_file(episode_file)
+create_individual_file(episode_file, year, write_to_disk = TRUE)
 }
 \arguments{
 \item{episode_file}{Tibble containing episodic data}
+
+\item{year}{The year to process, in FY format.}
+
+\item{write_to_disk}{(optional) Should the data be written to disk default is
+\code{TRUE} i.e. write the data to disk.}
+}
+\value{
+The processed individual file
 }
 \description{
 Creates individual file from episode file

From e8f1099c763e77788df943366e6ecef9d7c0c5b8 Mon Sep 17 00:00:00 2001
From: Zihao Li <lizihao_anu@outlook.com>
Date: Wed, 28 Jun 2023 17:42:08 +0100
Subject: [PATCH 154/200] fix delayed discharge beddays and yearstay

---
 R/link_delayed_discharge_eps.R | 130 ++++++++++++++++-----------------
 1 file changed, 63 insertions(+), 67 deletions(-)

diff --git a/R/link_delayed_discharge_eps.R b/R/link_delayed_discharge_eps.R
index 479545ed6..9f6996baa 100644
--- a/R/link_delayed_discharge_eps.R
+++ b/R/link_delayed_discharge_eps.R
@@ -16,13 +16,15 @@ link_delayed_discharge_eps <- function(data, year) {
       cij_start_date_lower = .data$cij_start_date - lubridate::days(1L),
       cij_end_date_upper = .data$cij_end_date + lubridate::days(1L),
       cij_end_month = last_date_month(.data$cij_end_date),
-      is_dummy_cij_start = is.na(.data$cij_start_date) & !is.na(.data$cij_end_date),
+      is_dummy_cij_start = is.na(.data$cij_start_date) &
+        !is.na(.data$cij_end_date),
       dummy_cij_start = dplyr::if_else(
         .data$is_dummy_cij_start,
         lubridate::as_date("1900-01-01"),
         .data$cij_start_date_lower
       ),
-      is_dummy_cij_end = !is.na(.data$cij_start_date) & is.na(.data$cij_end_date),
+      is_dummy_cij_end = !is.na(.data$cij_start_date) &
+        is.na(.data$cij_end_date),
       dummy_cij_end = dplyr::if_else(
         .data$is_dummy_cij_end,
         lubridate::today(),
@@ -34,37 +36,31 @@ link_delayed_discharge_eps <- function(data, year) {
   # no flag for last reported
   dd_data <-
     read_file(get_source_extract_path(year_param, "DD")) %>%
-    dplyr::rename(
-      # TODO Change the name of the variables in the DD extract rather than here.
+    dplyr::rename(# TODO Change the name of the variables in the DD extract rather than here.
       record_keydate1 = "keydate1_dateformat",
-      record_keydate2 = "keydate2_dateformat"
-    ) %>%
+      record_keydate2 = "keydate2_dateformat") %>%
     dplyr::mutate(
       # remember to revoke the keydate2 and amended_dates with dummy_keydate2
       is_dummy_keydate2 = is.na(.data$record_keydate2),
-      dummy_keydate2 = dplyr::if_else(.data$is_dummy_keydate2,
+      dummy_keydate2 = dplyr::if_else(
+        .data$is_dummy_keydate2,
         lubridate::today(),
         .data$record_keydate2
       ),
       dummy_id = dplyr::row_number()
     )
 
-  by_dd <- dplyr::join_by(
-    chi,
-    x$record_keydate1 >= y$dummy_cij_start,
-    x$dummy_keydate2 <= y$dummy_cij_end
-  )
+  by_dd <- dplyr::join_by(chi,
+                          x$record_keydate1 >= y$dummy_cij_start,
+                          x$dummy_keydate2 <= y$dummy_cij_end)
   data <- dd_data %>%
     dplyr::inner_join(data,
-      by = by_dd,
-      suffix = c("_dd", "")
-    ) %>%
-    dplyr::arrange(
-      .data$cij_start_date,
-      .data$cij_end_date,
-      .data$cij_marker,
-      .data$postcode
-    ) %>%
+                      by = by_dd,
+                      suffix = c("_dd", "")) %>%
+    dplyr::arrange(.data$cij_start_date,
+                   .data$cij_end_date,
+                   .data$cij_marker,
+                   .data$postcode) %>%
     # remove duplicate rows, but still got some duplicate mismatches
     dplyr::distinct(
       .data$chi,
@@ -265,12 +261,10 @@ link_delayed_discharge_eps <- function(data, year) {
       .data$datediff_end,
       dplyr::desc(.data$datediff_start)
     ) %>%
-    dplyr::distinct(
-      .data$postcode,
-      .data$record_keydate1_dd,
-      .data$record_keydate2_dd,
-      .keep_all = TRUE
-    ) %>%
+    dplyr::distinct(.data$postcode,
+                    .data$record_keydate1_dd,
+                    .data$record_keydate2_dd,
+                    .keep_all = TRUE) %>%
     # add cij_delay
     dplyr::mutate(has_delay = dplyr::if_else(
       .data$chi != "" & !is.na(.data$cij_marker),
@@ -281,47 +275,50 @@ link_delayed_discharge_eps <- function(data, year) {
     dplyr::mutate(cij_delay = max(has_delay)) %>%
     dplyr::ungroup() %>%
     # add yearstay and monthly beddays
-    create_monthly_beddays() %>%
-    dplyr::mutate(yearstay = dplyr::rowSums(
+    create_monthly_beddays(year,
+                           .data$record_keydate1,
+                           .data$record_keydate2) %>%
+    dplyr::mutate(yearstay = rowSums(dplyr::select(
+      .,
       paste0(month.abb[c(4:12, 1:3)] %>% tolower(), "_beddays")
-    ))
+    ))) %>%
 
-  # tidy up and rename columns to match the format of episode files
-  dplyr::select(
-    "year" = "year_dd",
-    "recid" = "recid_dd",
-    "record_keydate1" = "record_keydate1_dd",
-    "record_keydate2" = "record_keydate2_dd",
-    "smrtype",
-    "chi",
-    "gender",
-    "dob",
-    "age",
-    "gpprac",
-    "postcode" = "postcode_dd",
-    "lca" = "dd_responsible_lca",
-    "hbtreatcode" = "hbtreatcode_dd",
-    "original_admission_date",
-    "delay_end_reason",
-    "primary_delay_reason",
-    "secondary_delay_reason",
-    "cij_marker",
-    "cij_start_date",
-    "cij_end_date",
-    "cij_pattype_code",
-    "cij_ipdc",
-    "cij_admtype",
-    "cij_adm_spec",
-    "cij_dis_spec",
-    "cij_delay",
-    "location",
-    "spec" = "spec_dd",
-    "dd_type",
-    paste0(month.abb[c(4:12, 1:3)] %>% tolower(), "_beddays"),
-    "yearstay"
-  ) %>%
+    # tidy up and rename columns to match the format of episode files
+    dplyr::select(
+      "year" = "year_dd",
+      "recid" = "recid_dd",
+      "record_keydate1" = "record_keydate1_dd",
+      "record_keydate2" = "record_keydate2_dd",
+      "smrtype",
+      "chi",
+      "gender",
+      "dob",
+      "age",
+      "gpprac",
+      "postcode" = "postcode_dd",
+      "lca" = "dd_responsible_lca",
+      "hbtreatcode" = "hbtreatcode_dd",
+      "original_admission_date",
+      "delay_end_reason",
+      "primary_delay_reason",
+      "secondary_delay_reason",
+      "cij_marker",
+      "cij_start_date",
+      "cij_end_date",
+      "cij_pattype_code",
+      "cij_ipdc",
+      "cij_admtype",
+      "cij_adm_spec",
+      "cij_dis_spec",
+      "cij_delay",
+      "location",
+      "spec" = "spec_dd",
+      "dd_type",
+      all_of(paste0(month.abb[c(4:12, 1:3)] %>% tolower(), "_beddays")),
+      "yearstay"
+    ) %>%
     # combine DD with episode data
-    dplyr::bind_rows( # restore cij_end_date
+    dplyr::bind_rows(# restore cij_end_date
       data %>%
         dplyr::select(
           -c(
@@ -333,8 +330,7 @@ link_delayed_discharge_eps <- function(data, year) {
             "is_dummy_cij_end",
             "dummy_cij_end"
           )
-        )
-    )
+        ))
 
   return(data)
 }

From ff36479cc7a80488bd4f32f0baea34648fb9165a Mon Sep 17 00:00:00 2001
From: lizihao-anu <lizihao-anu@users.noreply.github.com>
Date: Wed, 28 Jun 2023 16:44:42 +0000
Subject: [PATCH 155/200] Style code

---
 R/link_delayed_discharge_eps.R | 49 ++++++++++++++++++++--------------
 1 file changed, 29 insertions(+), 20 deletions(-)

diff --git a/R/link_delayed_discharge_eps.R b/R/link_delayed_discharge_eps.R
index 9f6996baa..884fb3043 100644
--- a/R/link_delayed_discharge_eps.R
+++ b/R/link_delayed_discharge_eps.R
@@ -36,9 +36,10 @@ link_delayed_discharge_eps <- function(data, year) {
   # no flag for last reported
   dd_data <-
     read_file(get_source_extract_path(year_param, "DD")) %>%
-    dplyr::rename(# TODO Change the name of the variables in the DD extract rather than here.
+    dplyr::rename( # TODO Change the name of the variables in the DD extract rather than here.
       record_keydate1 = "keydate1_dateformat",
-      record_keydate2 = "keydate2_dateformat") %>%
+      record_keydate2 = "keydate2_dateformat"
+    ) %>%
     dplyr::mutate(
       # remember to revoke the keydate2 and amended_dates with dummy_keydate2
       is_dummy_keydate2 = is.na(.data$record_keydate2),
@@ -50,17 +51,22 @@ link_delayed_discharge_eps <- function(data, year) {
       dummy_id = dplyr::row_number()
     )
 
-  by_dd <- dplyr::join_by(chi,
-                          x$record_keydate1 >= y$dummy_cij_start,
-                          x$dummy_keydate2 <= y$dummy_cij_end)
+  by_dd <- dplyr::join_by(
+    chi,
+    x$record_keydate1 >= y$dummy_cij_start,
+    x$dummy_keydate2 <= y$dummy_cij_end
+  )
   data <- dd_data %>%
     dplyr::inner_join(data,
-                      by = by_dd,
-                      suffix = c("_dd", "")) %>%
-    dplyr::arrange(.data$cij_start_date,
-                   .data$cij_end_date,
-                   .data$cij_marker,
-                   .data$postcode) %>%
+      by = by_dd,
+      suffix = c("_dd", "")
+    ) %>%
+    dplyr::arrange(
+      .data$cij_start_date,
+      .data$cij_end_date,
+      .data$cij_marker,
+      .data$postcode
+    ) %>%
     # remove duplicate rows, but still got some duplicate mismatches
     dplyr::distinct(
       .data$chi,
@@ -262,9 +268,10 @@ link_delayed_discharge_eps <- function(data, year) {
       dplyr::desc(.data$datediff_start)
     ) %>%
     dplyr::distinct(.data$postcode,
-                    .data$record_keydate1_dd,
-                    .data$record_keydate2_dd,
-                    .keep_all = TRUE) %>%
+      .data$record_keydate1_dd,
+      .data$record_keydate2_dd,
+      .keep_all = TRUE
+    ) %>%
     # add cij_delay
     dplyr::mutate(has_delay = dplyr::if_else(
       .data$chi != "" & !is.na(.data$cij_marker),
@@ -275,14 +282,15 @@ link_delayed_discharge_eps <- function(data, year) {
     dplyr::mutate(cij_delay = max(has_delay)) %>%
     dplyr::ungroup() %>%
     # add yearstay and monthly beddays
-    create_monthly_beddays(year,
-                           .data$record_keydate1,
-                           .data$record_keydate2) %>%
+    create_monthly_beddays(
+      year,
+      .data$record_keydate1,
+      .data$record_keydate2
+    ) %>%
     dplyr::mutate(yearstay = rowSums(dplyr::select(
       .,
       paste0(month.abb[c(4:12, 1:3)] %>% tolower(), "_beddays")
     ))) %>%
-
     # tidy up and rename columns to match the format of episode files
     dplyr::select(
       "year" = "year_dd",
@@ -318,7 +326,7 @@ link_delayed_discharge_eps <- function(data, year) {
       "yearstay"
     ) %>%
     # combine DD with episode data
-    dplyr::bind_rows(# restore cij_end_date
+    dplyr::bind_rows( # restore cij_end_date
       data %>%
         dplyr::select(
           -c(
@@ -330,7 +338,8 @@ link_delayed_discharge_eps <- function(data, year) {
             "is_dummy_cij_end",
             "dummy_cij_end"
           )
-        ))
+        )
+    )
 
   return(data)
 }

From 23e851306a24d151576807aee52dca3bc69fa752 Mon Sep 17 00:00:00 2001
From: Zihao Li <lizihao_anu@outlook.com>
Date: Wed, 28 Jun 2023 17:58:07 +0100
Subject: [PATCH 156/200] fix preventable issues

---
 R/aggregate_by_chi_zihao.R | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/R/aggregate_by_chi_zihao.R b/R/aggregate_by_chi_zihao.R
index 5c3665e2a..8e3889260 100644
--- a/R/aggregate_by_chi_zihao.R
+++ b/R/aggregate_by_chi_zihao.R
@@ -181,18 +181,20 @@ aggregate_by_chi_zihao <- function(individual_file) {
   individual_file_cols7 <- individual_file[,
     `:=`(
       preventable_beddays =
-        data.table::fifelse(
+        # ifelse is faster than dplyr::if_else here
+        ifelse(
           cij_ppa == 1,
           max(cij_end_date) - min(cij_start_date),
-          NA_integer_
+          NA
         )
     ),
-    by = c("chi", "cij_marker")
+    # cij_marker has been renamed as cij_total
+    by = c("chi", "cij_total")
   ]
   individual_file_cols7 <- individual_file_cols7[,
     `:=`(
       preventable_admissions =
-        (unique(preventable_admissions) %>% uniqueN(na.rm = TRUE)),
+        (unique(preventable_admissions) %>% data.table::uniqueN(na.rm = TRUE)),
       preventable_beddays =
         sum(preventable_beddays, na.rm = TRUE)
     ),

From 3022576ff2b856b77963548de1c44a947f70404e Mon Sep 17 00:00:00 2001
From: lizihao-anu <lizihao-anu@users.noreply.github.com>
Date: Wed, 28 Jun 2023 17:01:41 +0000
Subject: [PATCH 157/200] Style code

---
 R/aggregate_by_chi_zihao.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/aggregate_by_chi_zihao.R b/R/aggregate_by_chi_zihao.R
index 8e3889260..405d5a795 100644
--- a/R/aggregate_by_chi_zihao.R
+++ b/R/aggregate_by_chi_zihao.R
@@ -181,7 +181,7 @@ aggregate_by_chi_zihao <- function(individual_file) {
   individual_file_cols7 <- individual_file[,
     `:=`(
       preventable_beddays =
-        # ifelse is faster than dplyr::if_else here
+      # ifelse is faster than dplyr::if_else here
         ifelse(
           cij_ppa == 1,
           max(cij_end_date) - min(cij_start_date),

From 9a7b8e0bbcb440e7017d3767162bfaac742662d0 Mon Sep 17 00:00:00 2001
From: Zihao Li <lizihao_anu@outlook.com>
Date: Wed, 28 Jun 2023 18:14:44 +0100
Subject: [PATCH 158/200] fix the issue of preventable stuff

---
 R/aggregate_by_chi_zihao.R | 31 ++++++++++++++++---------------
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/R/aggregate_by_chi_zihao.R b/R/aggregate_by_chi_zihao.R
index 405d5a795..387a093dc 100644
--- a/R/aggregate_by_chi_zihao.R
+++ b/R/aggregate_by_chi_zihao.R
@@ -179,26 +179,27 @@ aggregate_by_chi_zihao <- function(individual_file) {
     by = chi
   ]
   individual_file_cols7 <- individual_file[,
-    `:=`(
-      preventable_beddays =
-      # ifelse is faster than dplyr::if_else here
-        ifelse(
-          cij_ppa == 1,
-          max(cij_end_date) - min(cij_start_date),
-          NA
-        )
+    .(
+     preventable_admissions = preventable_admissions,
+     preventable_beddays =
+       # ifelse is faster than dplyr::if_else here
+       ifelse(
+         cij_ppa == 1,
+         max(cij_end_date) - min(cij_start_date),
+         NA
+       )
     ),
     # cij_marker has been renamed as cij_total
     by = c("chi", "cij_total")
   ]
   individual_file_cols7 <- individual_file_cols7[,
-    `:=`(
-      preventable_admissions =
-        (unique(preventable_admissions) %>% data.table::uniqueN(na.rm = TRUE)),
-      preventable_beddays =
-        sum(preventable_beddays, na.rm = TRUE)
-    ),
-    by = "chi"
+     .(
+       preventable_admissions =
+         data.table::uniqueN(unique(preventable_admissions), na.rm = TRUE),
+       preventable_beddays =
+         sum(preventable_beddays, na.rm = TRUE)
+     ),
+     by = "chi"
   ]
 
   individual_file <- dplyr::bind_cols(

From d2649439692bc4a1b78e8c6d9eadf6293d197c92 Mon Sep 17 00:00:00 2001
From: lizihao-anu <lizihao-anu@users.noreply.github.com>
Date: Wed, 28 Jun 2023 17:18:21 +0000
Subject: [PATCH 159/200] Style code

---
 R/aggregate_by_chi_zihao.R | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/R/aggregate_by_chi_zihao.R b/R/aggregate_by_chi_zihao.R
index 387a093dc..b4951bbc6 100644
--- a/R/aggregate_by_chi_zihao.R
+++ b/R/aggregate_by_chi_zihao.R
@@ -180,26 +180,26 @@ aggregate_by_chi_zihao <- function(individual_file) {
   ]
   individual_file_cols7 <- individual_file[,
     .(
-     preventable_admissions = preventable_admissions,
-     preventable_beddays =
-       # ifelse is faster than dplyr::if_else here
-       ifelse(
-         cij_ppa == 1,
-         max(cij_end_date) - min(cij_start_date),
-         NA
-       )
+      preventable_admissions = preventable_admissions,
+      preventable_beddays =
+      # ifelse is faster than dplyr::if_else here
+        ifelse(
+          cij_ppa == 1,
+          max(cij_end_date) - min(cij_start_date),
+          NA
+        )
     ),
     # cij_marker has been renamed as cij_total
     by = c("chi", "cij_total")
   ]
   individual_file_cols7 <- individual_file_cols7[,
-     .(
-       preventable_admissions =
-         data.table::uniqueN(unique(preventable_admissions), na.rm = TRUE),
-       preventable_beddays =
-         sum(preventable_beddays, na.rm = TRUE)
-     ),
-     by = "chi"
+    .(
+      preventable_admissions =
+        data.table::uniqueN(unique(preventable_admissions), na.rm = TRUE),
+      preventable_beddays =
+        sum(preventable_beddays, na.rm = TRUE)
+    ),
+    by = "chi"
   ]
 
   individual_file <- dplyr::bind_cols(

From 7433fb86c4f2beee64b0eb27ee5c91470097cab3 Mon Sep 17 00:00:00 2001
From: Zihao Li <zihao.li@phs.scot>
Date: Wed, 28 Jun 2023 18:41:04 +0100
Subject: [PATCH 160/200] Update R/aggregate_by_chi_zihao.R

---
 R/aggregate_by_chi_zihao.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/aggregate_by_chi_zihao.R b/R/aggregate_by_chi_zihao.R
index b4951bbc6..f855d918e 100644
--- a/R/aggregate_by_chi_zihao.R
+++ b/R/aggregate_by_chi_zihao.R
@@ -209,7 +209,7 @@ aggregate_by_chi_zihao <- function(individual_file) {
     individual_file_cols4[, chi := NULL],
     individual_file_cols5[, chi := NULL],
     individual_file_cols6[, chi := NULL],
-    individual_file_cols7[, chi := NULL],
+    individual_file_cols7[, chi := NULL]
   )
   # convert back to tibble
   individual_file <- dplyr::as_tibble(individual_file)

From 8f31277a0569264231be18c871b1170230e0788d Mon Sep 17 00:00:00 2001
From: Moohan <Moohan@users.noreply.github.com>
Date: Mon, 3 Jul 2023 10:49:27 +0000
Subject: [PATCH 161/200] Update documentation

---
 man/process_slf_deaths_lookup.Rd | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/man/process_slf_deaths_lookup.Rd b/man/process_slf_deaths_lookup.Rd
index c512777fa..2ecde97ce 100644
--- a/man/process_slf_deaths_lookup.Rd
+++ b/man/process_slf_deaths_lookup.Rd
@@ -18,7 +18,7 @@ process_slf_deaths_lookup(
 \item{nrs_deaths_data}{NRS deaths data.}
 
 \item{chi_deaths_data}{IT CHI deaths data.}
-  
+
 \item{write_to_disk}{(optional) Should the data be written to disk default is
 \code{TRUE} i.e. write the data to disk.}
 }

From b3f2d11e9492956c5276dfc4f7802bb623c34040 Mon Sep 17 00:00:00 2001
From: James McMahon <james.mcmahon@phs.scot>
Date: Mon, 3 Jul 2023 11:51:30 +0100
Subject: [PATCH 162/200] Fix minor typos

---
 R/aggregate_by_chi_zihao.R | 2 +-
 R/create_individual_file.R | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/R/aggregate_by_chi_zihao.R b/R/aggregate_by_chi_zihao.R
index f855d918e..0e3389b85 100644
--- a/R/aggregate_by_chi_zihao.R
+++ b/R/aggregate_by_chi_zihao.R
@@ -50,7 +50,7 @@ aggregate_by_chi_zihao <- function(individual_file) {
     )
   )
 
-  # colums specification, grouped by chi
+  # column specification, grouped by chi
   # columns to select last
   cols2 <- c(
     vars_end_with(
diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index 0b1b8fd30..bc9c1cd28 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -82,7 +82,7 @@ add_cij_columns <- function(episode_file) {
         cij_marker,
         NA_integer_
       )
-      # preventable_beddays is now added in aggragate_by_chi
+      # preventable_beddays is now added in aggregate_by_chi
     )
 }
 

From 1bc1d6c6244f4e8236d69254ba78285e25d2d785 Mon Sep 17 00:00:00 2001
From: James McMahon <james.mcmahon@phs.scot>
Date: Mon, 3 Jul 2023 10:52:41 +0000
Subject: [PATCH 163/200] [check-spelling] Update metadata

Update for https://github.com/Public-Health-Scotland/source-linkage-files/actions/runs/5443581387/attempts/1
Accepted in https://github.com/Public-Health-Scotland/source-linkage-files/pull/709#issuecomment-1617917895

Signed-off-by: check-spelling-bot <check-spelling-bot@users.noreply.github.com>
---
 .github/actions/spelling/expect.txt | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/.github/actions/spelling/expect.txt b/.github/actions/spelling/expect.txt
index 75e977ea6..394a4bbe0 100644
--- a/.github/actions/spelling/expect.txt
+++ b/.github/actions/spelling/expect.txt
@@ -28,6 +28,7 @@ cmh
 CNWs
 commhosp
 congen
+costincdnas
 costmonthnum
 costsfy
 covr
@@ -45,6 +46,7 @@ dbconnect
 dbplyr
 deathdiag
 demog
+dfc
 disch
 dischloc
 dischto
@@ -70,6 +72,7 @@ fyyear
 geogs
 ggplot
 GLS
+gls
 gms
 GPOo
 gpprac
@@ -99,6 +102,7 @@ keydate
 keyring
 keytime
 keytimex
+lgl
 los
 ltc
 ltcs
@@ -111,6 +115,7 @@ multiday
 multisession
 multistaff
 NAs
+newcons
 nhs
 nhshosp
 NRS
@@ -141,6 +146,7 @@ purrr
 quickstart
 Rbuildignore
 rds
+reabl
 reablement
 readcode
 readr
@@ -197,6 +203,6 @@ xintercept
 xlsx
 yearstay
 YYYYQX
+zihao
 zsav
 zstd
-zstd

From 31b3782e9515b433c4759c50b5d83e98c18df263 Mon Sep 17 00:00:00 2001
From: James McMahon <james.mcmahon@phs.scot>
Date: Mon, 3 Jul 2023 12:03:48 +0100
Subject: [PATCH 164/200] Remove some obsolete comments

---
 R/aggregate_by_chi_zihao.R | 2 --
 1 file changed, 2 deletions(-)

diff --git a/R/aggregate_by_chi_zihao.R b/R/aggregate_by_chi_zihao.R
index 0e3389b85..cebeae9d6 100644
--- a/R/aggregate_by_chi_zihao.R
+++ b/R/aggregate_by_chi_zihao.R
@@ -139,8 +139,6 @@ aggregate_by_chi_zihao <- function(individual_file) {
       c("_cohort", "end_fy", "start_fy")
     )
   )
-  # columns to group by chi and cij_marker, mainly preventable
-  # cols7 <- c("preventable_admissions", "preventable_beddays")
 
   # compute
   individual_file_cols1 <- individual_file[,

From a1371eda09400a85977b31bace0129d0d91bec71 Mon Sep 17 00:00:00 2001
From: James McMahon <james.mcmahon@phs.scot>
Date: Mon, 3 Jul 2023 12:04:26 +0100
Subject: [PATCH 165/200] Remove some unnecessary brackets

---
 R/create_individual_file.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index bc9c1cd28..406eeae7f 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -78,7 +78,7 @@ add_cij_columns <- function(episode_file) {
       #   1,
       #   0
       # ),
-      preventable_admissions = dplyr::if_else((.data$cij_ppa == 1),
+      preventable_admissions = dplyr::if_else(.data$cij_ppa == 1,
         cij_marker,
         NA_integer_
       )

From 64081c87f93c565aa886878500091215fa5d4706 Mon Sep 17 00:00:00 2001
From: James McMahon <james.mcmahon@phs.scot>
Date: Mon, 3 Jul 2023 12:05:00 +0100
Subject: [PATCH 166/200] Reformat some code

---
 R/create_individual_file.R     | 6 ++----
 R/link_delayed_discharge_eps.R | 3 ++-
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index 406eeae7f..5fde82353 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -327,8 +327,7 @@ add_ch_columns <- function(episode_file, prefix, condition) {
     add_standard_cols(prefix, condition) %>%
     dplyr::mutate(
       ch_cost_per_day = dplyr::if_else(
-        eval(condition) &
-          .data$yearstay > 0,
+        eval(condition) & .data$yearstay > 0,
         .data$cost_total_net / .data$yearstay,
         .data$cost_total_net
       ),
@@ -340,8 +339,7 @@ add_ch_columns <- function(episode_file, prefix, condition) {
       ),
       # If end date is missing use the first day of next FY quarter
       ch_ep_end = dplyr::if_else(
-        eval(condition) &
-          is.na(.data$ch_ep_end),
+        eval(condition) & is.na(.data$ch_ep_end),
         start_next_fy_quarter(.data$sc_latest_submission),
         .data$ch_ep_end
       )
diff --git a/R/link_delayed_discharge_eps.R b/R/link_delayed_discharge_eps.R
index fcd1715a4..c3920f60c 100644
--- a/R/link_delayed_discharge_eps.R
+++ b/R/link_delayed_discharge_eps.R
@@ -36,7 +36,8 @@ link_delayed_discharge_eps <- function(data, year) {
   # no flag for last reported
   dd_data <-
     read_file(get_source_extract_path(year_param, "DD")) %>%
-    dplyr::rename( # TODO Change the name of the variables in the DD extract rather than here.
+ # TODO Change the name of the variables in the DD extract rather than here.
+    dplyr::rename(
       record_keydate1 = "keydate1_dateformat",
       record_keydate2 = "keydate2_dateformat"
     ) %>%

From 0800662389b559304c7e6f24e99e9399aed7bbab Mon Sep 17 00:00:00 2001
From: James McMahon <james.mcmahon@phs.scot>
Date: Mon, 3 Jul 2023 12:05:37 +0100
Subject: [PATCH 167/200] Use some `dplyr` functions for readability

---
 R/link_delayed_discharge_eps.R | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/R/link_delayed_discharge_eps.R b/R/link_delayed_discharge_eps.R
index c3920f60c..de21cc6b1 100644
--- a/R/link_delayed_discharge_eps.R
+++ b/R/link_delayed_discharge_eps.R
@@ -288,10 +288,7 @@ link_delayed_discharge_eps <- function(data, year) {
       .data$record_keydate1,
       .data$record_keydate2
     ) %>%
-    dplyr::mutate(yearstay = rowSums(dplyr::select(
-      .,
-      paste0(month.abb[c(4:12, 1:3)] %>% tolower(), "_beddays")
-    ))) %>%
+    dplyr::mutate(yearstay = rowSums(dplyr::pick(dplyr::ends_with("_beddays")))) %>%
     # tidy up and rename columns to match the format of episode files
     dplyr::select(
       "year" = "year_dd",
@@ -323,7 +320,7 @@ link_delayed_discharge_eps <- function(data, year) {
       "location",
       "spec" = "spec_dd",
       "dd_type",
-      all_of(paste0(month.abb[c(4:12, 1:3)] %>% tolower(), "_beddays")),
+dplyr::ends_with("_beddays"),
       "yearstay"
     ) %>%
     # combine DD with episode data

From a954611ed60f1d01d6ad2513798a97db94767bd4 Mon Sep 17 00:00:00 2001
From: Moohan <Moohan@users.noreply.github.com>
Date: Mon, 3 Jul 2023 11:07:32 +0000
Subject: [PATCH 168/200] Style code

---
 R/link_delayed_discharge_eps.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/R/link_delayed_discharge_eps.R b/R/link_delayed_discharge_eps.R
index de21cc6b1..e52031b77 100644
--- a/R/link_delayed_discharge_eps.R
+++ b/R/link_delayed_discharge_eps.R
@@ -36,7 +36,7 @@ link_delayed_discharge_eps <- function(data, year) {
   # no flag for last reported
   dd_data <-
     read_file(get_source_extract_path(year_param, "DD")) %>%
- # TODO Change the name of the variables in the DD extract rather than here.
+    # TODO Change the name of the variables in the DD extract rather than here.
     dplyr::rename(
       record_keydate1 = "keydate1_dateformat",
       record_keydate2 = "keydate2_dateformat"
@@ -320,7 +320,7 @@ link_delayed_discharge_eps <- function(data, year) {
       "location",
       "spec" = "spec_dd",
       "dd_type",
-dplyr::ends_with("_beddays"),
+      dplyr::ends_with("_beddays"),
       "yearstay"
     ) %>%
     # combine DD with episode data

From 689dac2eaef5185bf05d72037b240080337e1f0a Mon Sep 17 00:00:00 2001
From: James McMahon <james.mcmahon@phs.scot>
Date: Mon, 3 Jul 2023 12:23:46 +0100
Subject: [PATCH 169/200] Update R/link_delayed_discharge_eps.R

---
 R/link_delayed_discharge_eps.R | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/R/link_delayed_discharge_eps.R b/R/link_delayed_discharge_eps.R
index e8ef322d7..3a65bda58 100644
--- a/R/link_delayed_discharge_eps.R
+++ b/R/link_delayed_discharge_eps.R
@@ -268,7 +268,8 @@ link_delayed_discharge_eps <- function(data, year) {
       .data$datediff_end,
       dplyr::desc(.data$datediff_start)
     ) %>%
-    dplyr::distinct(.data$postcode,
+    dplyr::distinct(
+    .data$postcode,
       .data$record_keydate1_dd,
       .data$record_keydate2_dd,
       .keep_all = TRUE

From fa6120d906f4b410a0433d561e5c832ffb403183 Mon Sep 17 00:00:00 2001
From: Moohan <Moohan@users.noreply.github.com>
Date: Mon, 3 Jul 2023 11:25:48 +0000
Subject: [PATCH 170/200] Style code

---
 R/link_delayed_discharge_eps.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/link_delayed_discharge_eps.R b/R/link_delayed_discharge_eps.R
index 3a65bda58..574b1a6fd 100644
--- a/R/link_delayed_discharge_eps.R
+++ b/R/link_delayed_discharge_eps.R
@@ -269,7 +269,7 @@ link_delayed_discharge_eps <- function(data, year) {
       dplyr::desc(.data$datediff_start)
     ) %>%
     dplyr::distinct(
-    .data$postcode,
+      .data$postcode,
       .data$record_keydate1_dd,
       .data$record_keydate2_dd,
       .keep_all = TRUE

From 16d6d22ebc54d751c738d22f2a0640085862e8fb Mon Sep 17 00:00:00 2001
From: James McMahon <james.mcmahon@phs.scot>
Date: Tue, 4 Jul 2023 15:46:25 +0100
Subject: [PATCH 171/200] Remove some code which is no longer needed

We now match on these variables afterwards, so they no longer need aggregating here.
---
 R/aggregate_by_chi_zihao.R | 23 ++---------------------
 1 file changed, 2 insertions(+), 21 deletions(-)

diff --git a/R/aggregate_by_chi_zihao.R b/R/aggregate_by_chi_zihao.R
index cebeae9d6..3bf1feeb1 100644
--- a/R/aggregate_by_chi_zihao.R
+++ b/R/aggregate_by_chi_zihao.R
@@ -128,17 +128,6 @@ aggregate_by_chi_zihao <- function(individual_file) {
   # columns to select maximum
   cols5 <- vars_contain(individual_file, c("nsu", "hl1_in_fy"))
   cols5 <- cols5[!(cols5 %in% c("ooh_consultation_time"))]
-  # columns to select first row
-  cols6 <- c(
-    condition_cols(),
-    # "death_date",
-    # "deceased",
-    "year",
-    vars_end_with(
-      individual_file,
-      c("_cohort", "end_fy", "start_fy")
-    )
-  )
 
   # compute
   individual_file_cols1 <- individual_file[,
@@ -170,13 +159,6 @@ aggregate_by_chi_zihao <- function(individual_file) {
     by = chi
   ]
   individual_file_cols6 <- individual_file[,
-    lapply(.SD, function(x) {
-      x[!is.na(x)][1]
-    }),
-    .SDcols = cols6,
-    by = chi
-  ]
-  individual_file_cols7 <- individual_file[,
     .(
       preventable_admissions = preventable_admissions,
       preventable_beddays =
@@ -190,7 +172,7 @@ aggregate_by_chi_zihao <- function(individual_file) {
     # cij_marker has been renamed as cij_total
     by = c("chi", "cij_total")
   ]
-  individual_file_cols7 <- individual_file_cols7[,
+  individual_file_cols6 <- individual_file_cols6[,
     .(
       preventable_admissions =
         data.table::uniqueN(unique(preventable_admissions), na.rm = TRUE),
@@ -206,8 +188,7 @@ aggregate_by_chi_zihao <- function(individual_file) {
     individual_file_cols3[, chi := NULL],
     individual_file_cols4[, chi := NULL],
     individual_file_cols5[, chi := NULL],
-    individual_file_cols6[, chi := NULL],
-    individual_file_cols7[, chi := NULL]
+    individual_file_cols6[, chi := NULL]
   )
   # convert back to tibble
   individual_file <- dplyr::as_tibble(individual_file)

From 77ddd9e27ab7687baba2daddf63bf13c0f0d391c Mon Sep 17 00:00:00 2001
From: James McMahon <james.mcmahon@phs.scot>
Date: Tue, 4 Jul 2023 15:49:03 +0100
Subject: [PATCH 172/200] Work out preventable admissions with similar
 indicators

---
 R/aggregate_by_chi_zihao.R | 20 ++++++++------------
 R/create_individual_file.R |  2 +-
 2 files changed, 9 insertions(+), 13 deletions(-)

diff --git a/R/aggregate_by_chi_zihao.R b/R/aggregate_by_chi_zihao.R
index 3bf1feeb1..cfece9f19 100644
--- a/R/aggregate_by_chi_zihao.R
+++ b/R/aggregate_by_chi_zihao.R
@@ -89,8 +89,9 @@ aggregate_by_chi_zihao <- function(individual_file) {
     "cij_total",
     "cij_el",
     "cij_non_el",
-    "cij_mat"
-    # "cij_delay"
+    "cij_mat",
+    # "cij_delay",
+    "preventable_admissions"
   )
   # columns to sum up
   cols4 <- c(
@@ -160,22 +161,17 @@ aggregate_by_chi_zihao <- function(individual_file) {
   ]
   individual_file_cols6 <- individual_file[,
     .(
-      preventable_admissions = preventable_admissions,
-      preventable_beddays =
-      # ifelse is faster than dplyr::if_else here
-        ifelse(
-          cij_ppa == 1,
-          max(cij_end_date) - min(cij_start_date),
-          NA
-        )
+      preventable_beddays = ifelse(
+        cij_ppa == 1,
+        max(cij_end_date) - min(cij_start_date),
+        NA_real_
+      )
     ),
     # cij_marker has been renamed as cij_total
     by = c("chi", "cij_total")
   ]
   individual_file_cols6 <- individual_file_cols6[,
     .(
-      preventable_admissions =
-        data.table::uniqueN(unique(preventable_admissions), na.rm = TRUE),
       preventable_beddays =
         sum(preventable_beddays, na.rm = TRUE)
     ),
diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index 5fde82353..4d3555d71 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -79,7 +79,7 @@ add_cij_columns <- function(episode_file) {
       #   0
       # ),
       preventable_admissions = dplyr::if_else(.data$cij_ppa == 1,
-        cij_marker,
+        .data$cij_marker,
         NA_integer_
       )
       # preventable_beddays is now added in aggregate_by_chi

From 51a0767f0bc92034e69c350aefd051c5bd9ca50c Mon Sep 17 00:00:00 2001
From: James McMahon <james.mcmahon@phs.scot>
Date: Tue, 4 Jul 2023 15:56:11 +0100
Subject: [PATCH 173/200] Lowercase variable names

---
 R/create_individual_file.R | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index 4d3555d71..61663269d 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -60,15 +60,15 @@ add_cij_columns <- function(episode_file) {
 
   episode_file %>%
     dplyr::mutate(
-      CIJ_non_el = dplyr::if_else(.data$cij_pattype_code == 0,
+      cij_non_el = dplyr::if_else(.data$cij_pattype_code == 0,
         .data$cij_marker,
         NA_real_
       ),
-      CIJ_el = dplyr::if_else(.data$cij_pattype_code == 1,
+      cij_el = dplyr::if_else(.data$cij_pattype_code == 1,
         .data$cij_marker,
         NA_real_
       ),
-      CIJ_mat = dplyr::if_else(.data$cij_pattype_code == 2,
+      cij_mat = dplyr::if_else(.data$cij_pattype_code == 2,
         .data$cij_marker,
         NA_real_
       ),
@@ -631,9 +631,9 @@ aggregate_by_chi <- function(episode_file) {
         c(
           "ch_cis_episodes" = "ch_chi_cis",
           "cij_total" = "cij_marker",
-          "CIJ_el",
-          "CIJ_non_el",
-          "CIJ_mat",
+          "cij_el",
+          "cij_non_el",
+          "cij_mat",
           # "cij_delay",
           "ooh_cases" = "ooh_case_id",
           "preventable_admissions"

From b17f80653ccc1ecad209a88ae78c245ac93e8535 Mon Sep 17 00:00:00 2001
From: James McMahon <james.mcmahon@phs.scot>
Date: Tue, 4 Jul 2023 16:03:10 +0100
Subject: [PATCH 174/200] Restore `cij_delay`

---
 R/aggregate_by_chi_zihao.R |  2 +-
 R/create_individual_file.R | 24 +++++++++++++-----------
 2 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/R/aggregate_by_chi_zihao.R b/R/aggregate_by_chi_zihao.R
index cfece9f19..ec7fe5b64 100644
--- a/R/aggregate_by_chi_zihao.R
+++ b/R/aggregate_by_chi_zihao.R
@@ -90,7 +90,7 @@ aggregate_by_chi_zihao <- function(individual_file) {
     "cij_el",
     "cij_non_el",
     "cij_mat",
-    # "cij_delay",
+    "cij_delay",
     "preventable_admissions"
   )
   # columns to sum up
diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index 61663269d..2d2d5be38 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -60,29 +60,31 @@ add_cij_columns <- function(episode_file) {
 
   episode_file %>%
     dplyr::mutate(
-      cij_non_el = dplyr::if_else(.data$cij_pattype_code == 0,
+      cij_non_el = dplyr::if_else(
+        .data$cij_pattype_code == 0,
         .data$cij_marker,
         NA_real_
       ),
-      cij_el = dplyr::if_else(.data$cij_pattype_code == 1,
+      cij_el = dplyr::if_else(
+        .data$cij_pattype_code == 1,
         .data$cij_marker,
         NA_real_
       ),
-      cij_mat = dplyr::if_else(.data$cij_pattype_code == 2,
+      cij_mat = dplyr::if_else(
+        .data$cij_pattype_code == 2,
         .data$cij_marker,
         NA_real_
       ),
-      # # assume cij_delay is logic variable
-      # cij_delay = dplyr::if_else(
-      #   (.data$cij_delay & .data$cij_marker == 1),
-      #   1,
-      #   0
-      # ),
-      preventable_admissions = dplyr::if_else(.data$cij_ppa == 1,
+      cij_delay = dplyr::if_else(
+        .data$recid == "DD",
+        .data$cij_marker,
+        NA_real_
+      ),
+      preventable_admissions = dplyr::if_else(
+        .data$cij_ppa == 1,
         .data$cij_marker,
         NA_integer_
       )
-      # preventable_beddays is now added in aggregate_by_chi
     )
 }
 

From 12ec4f63cdf65a0cb35bdec9fd0b4767a885b4a1 Mon Sep 17 00:00:00 2001
From: James McMahon <james.mcmahon@phs.scot>
Date: Tue, 4 Jul 2023 16:08:01 +0100
Subject: [PATCH 175/200] Restore DN variables

---
 R/aggregate_by_chi_zihao.R | 2 +-
 R/create_individual_file.R | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/R/aggregate_by_chi_zihao.R b/R/aggregate_by_chi_zihao.R
index ec7fe5b64..e84f7aac6 100644
--- a/R/aggregate_by_chi_zihao.R
+++ b/R/aggregate_by_chi_zihao.R
@@ -103,7 +103,7 @@ aggregate_by_chi_zihao <- function(individual_file) {
         "cost",
         "attendances",
         "attend",
-        # "contacts",
+        "contacts",
         "hours",
         "alarms",
         "telecare",
diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index 2d2d5be38..be31c2ed6 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -106,7 +106,7 @@ add_all_columns <- function(episode_file) {
     add_ae_columns("AE", .data$recid == "AE2") %>%
     add_pis_columns("PIS", .data$recid == "PIS") %>%
     add_ooh_columns("OoH", .data$recid == "OoH") %>%
-    # add_dn_columns("DN", .data$recid == "DN") %>%
+    add_dn_columns("DN", .data$recid == "DN") %>%
     add_cmh_columns("CMH", .data$recid == "CMH") %>%
     add_dd_columns("DD", .data$recid == "DD") %>%
     add_nsu_columns("NSU", .data$recid == "NSU") %>%

From 33681d3bf5501048e5336dd35419f32b02525f30 Mon Sep 17 00:00:00 2001
From: James McMahon <james.mcmahon@phs.scot>
Date: Tue, 4 Jul 2023 16:28:24 +0100
Subject: [PATCH 176/200] Tidy the code and use integers where possible

---
 R/aggregate_by_chi_zihao.R |  21 +++----
 R/create_individual_file.R | 116 ++++++++++++++++++++-----------------
 2 files changed, 70 insertions(+), 67 deletions(-)

diff --git a/R/aggregate_by_chi_zihao.R b/R/aggregate_by_chi_zihao.R
index e84f7aac6..5b2ccc1ff 100644
--- a/R/aggregate_by_chi_zihao.R
+++ b/R/aggregate_by_chi_zihao.R
@@ -11,7 +11,7 @@ aggregate_by_chi_zihao <- function(individual_file) {
   cli::cli_alert_info("Aggregate by CHI function started at {Sys.time()}")
 
   individual_file <- individual_file %>%
-    dplyr::select(-c(postcode, gpprac)) %>%
+    dplyr::select(-c("postcode", "gpprac")) %>%
     dplyr::rename(
       "gpprac" = "most_recent_gpprac",
       "postcode" = "most_recent_postcode"
@@ -133,31 +133,31 @@ aggregate_by_chi_zihao <- function(individual_file) {
   # compute
   individual_file_cols1 <- individual_file[,
     .(gender = mean(gender)),
-    by = chi
+    by = "chi"
   ]
   individual_file_cols2 <- individual_file[,
     .SD[.N],
     .SDcols = cols2,
-    by = chi
+    by = "chi"
   ]
   individual_file_cols3 <- individual_file[,
     lapply(.SD, function(x) {
       data.table::uniqueN(x, na.rm = TRUE)
     }),
     .SDcols = cols3,
-    by = chi
+    by = "chi"
   ]
   individual_file_cols4 <- individual_file[,
     lapply(.SD, function(x) {
       sum(x, na.rm = TRUE)
     }),
     .SDcols = cols4,
-    by = chi
+    by = "chi"
   ]
   individual_file_cols5 <- individual_file[,
     lapply(.SD, function(x) max(x, na.rm = TRUE)),
     .SDcols = cols5,
-    by = chi
+    by = "chi"
   ]
   individual_file_cols6 <- individual_file[,
     .(
@@ -172,8 +172,7 @@ aggregate_by_chi_zihao <- function(individual_file) {
   ]
   individual_file_cols6 <- individual_file_cols6[,
     .(
-      preventable_beddays =
-        sum(preventable_beddays, na.rm = TRUE)
+      preventable_beddays = sum(preventable_beddays, na.rm = TRUE)
     ),
     by = "chi"
   ]
@@ -195,7 +194,6 @@ aggregate_by_chi_zihao <- function(individual_file) {
 
 #' select columns ending with some patterns
 #' @describeIn select columns based on patterns
-#'
 vars_end_with <- function(data, vars, ignore_case = FALSE) {
   names(data)[stringr::str_ends(
     names(data),
@@ -207,7 +205,6 @@ vars_end_with <- function(data, vars, ignore_case = FALSE) {
 
 #' select columns starting with some patterns
 #' @describeIn select columns based on patterns
-#'
 vars_start_with <- function(data, vars, ignore_case = FALSE) {
   names(data)[stringr::str_starts(
     names(data),
@@ -219,7 +216,6 @@ vars_start_with <- function(data, vars, ignore_case = FALSE) {
 
 #' select columns contains some characters
 #' @describeIn select columns based on patterns
-#'
 vars_contain <- function(data, vars, ignore_case = FALSE) {
   names(data)[stringr::str_detect(
     names(data),
@@ -233,7 +229,6 @@ vars_contain <- function(data, vars, ignore_case = FALSE) {
 #'
 #' @description Aggregate CH variables by CHI and CIS.
 #'
-#'
 #' @inheritParams create_individual_file
 aggregate_ch_episodes_zihao <- function(episode_file) {
   cli::cli_alert_info("Aggregate ch episodes function started at {Sys.time()}")
@@ -247,7 +242,7 @@ aggregate_ch_episodes_zihao <- function(episode_file) {
     ch_ep_start = min(record_keydate1),
     ch_ep_end = max(ch_ep_end),
     ch_cost_per_day = mean(ch_cost_per_day)
-  ), by = .(chi, ch_chi_cis)]
+  ), by = .("chi", "ch_chi_cis")]
 
   # Convert back to tibble if needed
   episode_file <- tibble::as_tibble(episode_file)
diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index be31c2ed6..54d1dbe93 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -9,7 +9,6 @@
 #'
 #' @return The processed individual file
 #' @export
-#'
 create_individual_file <- function(episode_file, year, write_to_disk = TRUE) {
   individual_file <- episode_file %>%
     remove_blank_chi() %>%
@@ -118,24 +117,23 @@ add_all_columns <- function(episode_file) {
     add_sds_columns("SDS", .data$recid == "SDS") %>%
     dplyr::mutate(
       health_net_cost = rowSums(
-        dplyr::select(
-          .,
-          c(
-            Acute_cost,
-            Mat_cost,
-            MH_cost,
-            GLS_cost,
-            OP_cost_attend,
-            AE_cost,
-            PIS_cost,
-            OoH_cost
-          )
+        dplyr::pick(
+          .data$Acute_cost,
+          .data$Mat_cost,
+          .data$MH_cost,
+          .data$GLS_cost,
+          .data$OP_cost_attend,
+          .data$AE_cost,
+          .data$PIS_cost,
+          .data$OoH_cost
         ),
         na.rm = TRUE
       ),
-      health_net_costincdnas =
-        health_net_cost +
-          dplyr::if_else(is.na(OP_cost_dnas), 0, OP_cost_dnas)
+      health_net_costincdnas = .data$health_net_cost + dplyr::if_else(
+        is.na(.data$OP_cost_dnas),
+        0,
+        .data$OP_cost_dnas
+      )
     )
 }
 
@@ -191,13 +189,13 @@ add_op_columns <- function(episode_file, prefix, condition) {
   condition_1 <- substitute(condition & attendance_status == 1)
   episode_file <- episode_file %>%
     dplyr::mutate(
-      "{prefix}_newcons_attendances" := dplyr::if_else(eval(condition_1), 1, NA_real_),
+      "{prefix}_newcons_attendances" := dplyr::if_else(eval(condition_1), 1L, NA_integer_),
       "{prefix}_cost_attend" := dplyr::if_else(eval(condition_1), .data$cost_total_net, NA_real_)
     )
   condition_5_8 <- substitute(condition & attendance_status %in% c(5, 8))
   episode_file <- episode_file %>%
     dplyr::mutate(
-      "{prefix}_newcons_dnas" := dplyr::if_else(eval(condition_5_8), 1, NA_real_),
+      "{prefix}_newcons_dnas" := dplyr::if_else(eval(condition_5_8), 1L, NA_integer_),
       "{prefix}_cost_dnas" := dplyr::if_else(eval(condition_5_8), .data$cost_total_net_inc_dnas, NA_real_)
     )
   return(episode_file)
@@ -210,7 +208,7 @@ add_ae_columns <- function(episode_file, prefix, condition) {
   condition <- substitute(condition)
   episode_file %>%
     add_standard_cols(prefix, condition, cost = TRUE) %>%
-    dplyr::mutate("{prefix}_attendances" := dplyr::if_else(eval(condition), 1, NA_real_))
+    dplyr::mutate("{prefix}_attendances" := dplyr::if_else(eval(condition), 1L, NA_integer_))
 }
 
 #' Add PIS columns
@@ -220,7 +218,7 @@ add_pis_columns <- function(episode_file, prefix, condition) {
   condition <- substitute(condition)
   episode_file %>%
     add_standard_cols(prefix, condition, cost = TRUE) %>%
-    dplyr::mutate("{prefix}_paid_items" := dplyr::if_else(eval(condition), .data$no_paid_items, NA_real_))
+    dplyr::mutate("{prefix}_paid_items" := dplyr::if_else(eval(condition), .data$no_paid_items, NA_integer_))
 }
 
 #' Add OoH columns
@@ -231,21 +229,27 @@ add_ooh_columns <- function(episode_file, prefix, condition) {
   episode_file <- episode_file %>%
     add_standard_cols(prefix, condition, cost = TRUE) %>%
     dplyr::mutate(
-      "{prefix}_homeV" := dplyr::if_else(eval(condition) & .data$smrtype == "OOH-HomeV", 1, NA_real_),
-      "{prefix}_advice" := dplyr::if_else(eval(condition) & .data$smrtype == "OOH-Advice", 1, NA_real_),
-      "{prefix}_DN" := dplyr::if_else(eval(condition) & .data$smrtype == "OOH-DN", 1, NA_real_),
-      "{prefix}_NHS24" := dplyr::if_else(eval(condition) & .data$smrtype == "OOH-NHS24", 1, NA_real_),
-      "{prefix}_other" := dplyr::if_else(eval(condition) & .data$smrtype == "OOH-Other", 1, NA_real_),
-      "{prefix}_PCC" := dplyr::if_else(eval(condition) & .data$smrtype == "OOH-PCC", 1, NA_real_),
-      ooh_covid_advice = dplyr::if_else(eval(condition) & .data$smrtype == "OOH-C19Adv", 1, NA_real_),
-      ooh_covid_assessment = dplyr::if_else(eval(condition) & .data$smrtype == "OOH-C19Ass", 1, NA_real_),
-      ooh_covid_other = dplyr::if_else(eval(condition) & .data$smrtype == "OOH-C190th", 1, NA_real_)
+      "{prefix}_homeV" := dplyr::if_else(eval(condition) & .data$smrtype == "OOH-HomeV", 1L, NA_integer_),
+      "{prefix}_advice" := dplyr::if_else(eval(condition) & .data$smrtype == "OOH-Advice", 1L, NA_integer_),
+      "{prefix}_DN" := dplyr::if_else(eval(condition) & .data$smrtype == "OOH-DN", 1L, NA_integer_),
+      "{prefix}_NHS24" := dplyr::if_else(eval(condition) & .data$smrtype == "OOH-NHS24", 1L, NA_integer_),
+      "{prefix}_other" := dplyr::if_else(eval(condition) & .data$smrtype == "OOH-Other", 1L, NA_integer_),
+      "{prefix}_PCC" := dplyr::if_else(eval(condition) & .data$smrtype == "OOH-PCC", 1L, NA_integer_),
+      "{prefix}_covid_advice" := dplyr::if_else(eval(condition) & .data$smrtype == "OOH-C19Adv", 1L, NA_integer_),
+      "{prefix}_covid_assessment" := dplyr::if_else(eval(condition) & .data$smrtype == "OOH-C19Ass", 1L, NA_integer_),
+      "{prefix}_covid_other" := dplyr::if_else(eval(condition) & .data$smrtype == "OOH-C190th", 1L, NA_integer_)
     )
 
   episode_file <- episode_file %>%
     dplyr::mutate(
-      OoH_consultation_time = dplyr::if_else(eval(condition), as.numeric((lubridate::seconds_to_period(.data$keytime2) + .data$record_keydate2) - (lubridate::seconds_to_period(.data$keytime1) + .data$record_keydate1), units = "mins"), NA_real_),
-      OoH_consultation_time = dplyr::if_else(OoH_consultation_time < 0, 0, .data$OoH_consultation_time),
+      "{prefix}_consultation_time" := dplyr::if_else(
+        eval(condition),
+        pmax(
+          0,
+          as.numeric((lubridate::seconds_to_period(.data$keytime2) + .data$record_keydate2) - (lubridate::seconds_to_period(.data$keytime1) + .data$record_keydate1), units = "mins")
+        ),
+        NA_real_
+      ),
     )
 
   return(episode_file)
@@ -258,7 +262,7 @@ add_dn_columns <- function(episode_file, prefix, condition) {
   condition <- substitute(condition)
   episode_file %>%
     add_standard_cols(prefix, condition, episode = TRUE, cost = TRUE) %>%
-    dplyr::mutate("{prefix}_contacts" := dplyr::if_else(eval(condition), .data$total_no_dn_contacts, NA_real_))
+    dplyr::mutate("{prefix}_contacts" := dplyr::if_else(eval(condition), .data$total_no_dn_contacts, NA_integer_))
 }
 
 #' Add CMH columns
@@ -268,7 +272,7 @@ add_cmh_columns <- function(episode_file, prefix, condition) {
   condition <- substitute(condition)
   episode_file %>%
     add_standard_cols(prefix, condition) %>%
-    dplyr::mutate("{prefix}_contacts" := dplyr::if_else(eval(condition), 1, NA_real_))
+    dplyr::mutate("{prefix}_contacts" := dplyr::if_else(eval(condition), 1L, NA_integer_))
 }
 
 #' Add DD columns
@@ -279,13 +283,13 @@ add_dd_columns <- function(episode_file, prefix, condition) {
   condition_delay <- substitute(condition & primary_delay_reason != "9")
   episode_file <- episode_file %>%
     dplyr::mutate(
-      "{prefix}_NonCode9_episodes" := dplyr::if_else(eval(condition_delay), 1, NA_real_),
+      "{prefix}_NonCode9_episodes" := dplyr::if_else(eval(condition_delay), 1L, NA_integer_),
       "{prefix}_NonCode9_beddays" := dplyr::if_else(eval(condition_delay), .data$yearstay, NA_real_)
     )
   condition_delay_9 <- substitute(condition & primary_delay_reason == "9")
   episode_file <- episode_file %>%
     dplyr::mutate(
-      "{prefix}_Code9_episodes" := dplyr::if_else(eval(condition_delay_9), 1, NA_real_),
+      "{prefix}_Code9_episodes" := dplyr::if_else(eval(condition_delay_9), 1L, NA_integer_),
       "{prefix}_Code9_beddays" := dplyr::if_else(eval(condition_delay_9), .data$yearstay, NA_real_)
     )
   return(episode_file)
@@ -298,7 +302,7 @@ add_nsu_columns <- function(episode_file, prefix, condition) {
   condition <- substitute(condition)
   episode_file %>%
     add_standard_cols(prefix, condition) %>%
-    dplyr::mutate("{prefix}" := dplyr::if_else(eval(condition), 1, NA_real_))
+    dplyr::mutate("{prefix}" := dplyr::if_else(eval(condition), 1L, NA_integer_))
 }
 
 #' Add NRS columns
@@ -308,7 +312,7 @@ add_nrs_columns <- function(episode_file, prefix, condition) {
   condition <- substitute(condition)
   episode_file %>%
     add_standard_cols(prefix, condition) %>%
-    dplyr::mutate("{prefix}" := dplyr::if_else(eval(condition), 1, NA_real_))
+    dplyr::mutate("{prefix}" := dplyr::if_else(eval(condition), 1L, NA_integer_))
 }
 
 #' Add HL1 columns
@@ -362,21 +366,21 @@ add_hc_columns <- function(episode_file, prefix, condition) {
   condition_per <- substitute(condition & smrtype == "HC-Per")
   episode_file <- episode_file %>%
     dplyr::mutate(
-      "{prefix}_personal_episodes" := dplyr::if_else(eval(condition_per), 1, NA_real_),
+      "{prefix}_personal_episodes" := dplyr::if_else(eval(condition_per), 1L, NA_integer_),
       "{prefix}_personal_hours" := dplyr::if_else(eval(condition_per), .data$HC_total_hours, NA_real_),
       "{prefix}_personal_hours_cost" := dplyr::if_else(eval(condition_per), .data$cost_total_net, NA_real_)
     )
   condition_non_per <- substitute(condition & smrtype == "HC-Non-Per")
   episode_file <- episode_file %>%
     dplyr::mutate(
-      "{prefix}_non_personal_episodes" := dplyr::if_else(eval(condition_non_per), 1, NA_real_),
+      "{prefix}_non_personal_episodes" := dplyr::if_else(eval(condition_non_per), 1L, NA_integer_),
       "{prefix}_non_personal_hours" := dplyr::if_else(eval(condition_non_per), .data$hc_hours_annual, NA_real_),
       "{prefix}_non_personal_hours_cost" := dplyr::if_else(eval(condition_non_per), .data$cost_total_net, NA_real_)
     )
   condition_reabl <- substitute(condition & hc_reablement == 1)
   episode_file <- episode_file %>%
     dplyr::mutate(
-      "{prefix}_reablement_episodes" := dplyr::if_else(eval(condition_reabl), 1, NA_real_),
+      "{prefix}_reablement_episodes" := dplyr::if_else(eval(condition_reabl), 1L, NA_integer_),
       "{prefix}_reablement_hours" := dplyr::if_else(eval(condition_reabl), .data$hc_hours_annual, NA_real_),
       "{prefix}_reablement_hours_cost" := dplyr::if_else(eval(condition_reabl), .data$cost_total_net, NA_real_)
     )
@@ -390,8 +394,8 @@ add_at_columns <- function(episode_file, prefix, condition) {
   episode_file %>%
     add_standard_cols(prefix, condition) %>%
     dplyr::mutate(
-      "{prefix}_alarms" := dplyr::if_else(eval(condition) & .data$smrtype == "AT-Alarm", 1, NA_real_),
-      "{prefix}_telecare" := dplyr::if_else(eval(condition) & .data$smrtype == "AT-Tele", 1, NA_real_)
+      "{prefix}_alarms" := dplyr::if_else(eval(condition) & .data$smrtype == "AT-Alarm", 1L, NA_integer_),
+      "{prefix}_telecare" := dplyr::if_else(eval(condition) & .data$smrtype == "AT-Tele", 1L, NA_integer_)
     )
 }
 
@@ -403,10 +407,10 @@ add_sds_columns <- function(episode_file, prefix, condition) {
   episode_file %>%
     add_standard_cols(prefix, condition) %>%
     dplyr::mutate(
-      "{prefix}_option_1" := dplyr::if_else(eval(condition) & .data$smrtype == "SDS-1", 1, NA_real_),
-      "{prefix}_option_2" := dplyr::if_else(eval(condition) & .data$smrtype == "SDS-2", 1, NA_real_),
-      "{prefix}_option_3" := dplyr::if_else(eval(condition) & .data$smrtype == "SDS-3", 1, NA_real_),
-      "{prefix}_option_4" := dplyr::if_else(eval(condition) & .data$smrtype == "SDS-4", 1, NA_real_)
+      "{prefix}_option_1" := dplyr::if_else(eval(condition) & .data$smrtype == "SDS-1", 1L, NA_integer_),
+      "{prefix}_option_2" := dplyr::if_else(eval(condition) & .data$smrtype == "SDS-2", 1L, NA_integer_),
+      "{prefix}_option_3" := dplyr::if_else(eval(condition) & .data$smrtype == "SDS-3", 1L, NA_integer_),
+      "{prefix}_option_4" := dplyr::if_else(eval(condition) & .data$smrtype == "SDS-4", 1L, NA_integer_)
     )
 }
 
@@ -423,21 +427,21 @@ add_ipdc_cols <- function(episode_file, prefix, condition, ipdc_d = TRUE, electi
   episode_file <- episode_file %>%
     dplyr::mutate(
       "{prefix}_inpatient_cost" := dplyr::if_else(eval(condition_i), .data$cost_total_net, NA_real_),
-      "{prefix}_inpatient_episodes" := dplyr::if_else(eval(condition_i), 1, NA_real_),
+      "{prefix}_inpatient_episodes" := dplyr::if_else(eval(condition_i), 1L, NA_integer_),
       "{prefix}_inpatient_beddays" := dplyr::if_else(eval(condition_i), .data$yearstay, NA_real_)
     )
   if (elective) {
     condition_el <- substitute(condition_i & cij_pattype == "Elective")
     episode_file <- episode_file %>%
       dplyr::mutate(
-        "{prefix}_el_inpatient_episodes" := dplyr::if_else(eval(condition_el), 1, NA_real_),
+        "{prefix}_el_inpatient_episodes" := dplyr::if_else(eval(condition_el), 1L, NA_integer_),
         "{prefix}_el_inpatient_beddays" := dplyr::if_else(eval(condition_el), .data$yearstay, NA_real_),
         "{prefix}_el_inpatient_cost" := dplyr::if_else(eval(condition_el), .data$cost_total_net, NA_real_)
       )
     condition_non_el <- substitute(condition_i & cij_pattype == "Non-Elective")
     episode_file <- episode_file %>%
       dplyr::mutate(
-        "{prefix}_non_el_inpatient_episodes" := dplyr::if_else(eval(condition_non_el), 1, NA_real_),
+        "{prefix}_non_el_inpatient_episodes" := dplyr::if_else(eval(condition_non_el), 1L, NA_integer_),
         "{prefix}_non_el_inpatient_beddays" := dplyr::if_else(eval(condition_non_el), .data$yearstay, NA_real_),
         "{prefix}_non_el_inpatient_cost" := dplyr::if_else(eval(condition_non_el), .data$cost_total_net, NA_real_)
       )
@@ -446,7 +450,7 @@ add_ipdc_cols <- function(episode_file, prefix, condition, ipdc_d = TRUE, electi
     condition_d <- substitute(eval(condition) & ipdc == "D")
     episode_file <- episode_file %>%
       dplyr::mutate(
-        "{prefix}_daycase_episodes" := dplyr::if_else(eval(condition_d), 1, NA_real_),
+        "{prefix}_daycase_episodes" := dplyr::if_else(eval(condition_d), 1L, NA_integer_),
         "{prefix}_daycase_cost" := dplyr::if_else(eval(condition_d), .data$cost_total_net, NA_real_)
       )
   }
@@ -464,7 +468,7 @@ add_ipdc_cols <- function(episode_file, prefix, condition, ipdc_d = TRUE, electi
 add_standard_cols <- function(episode_file, prefix, condition, drop = NULL, episode = FALSE, cost = FALSE) {
   episode_file <- dplyr::bind_cols(episode_file, create_cols(episode_file, prefix, condition, drop))
   if (episode) {
-    episode_file <- dplyr::mutate(episode_file, "{prefix}_episodes" := dplyr::if_else(eval(condition), 1, NA_real_))
+    episode_file <- dplyr::mutate(episode_file, "{prefix}_episodes" := dplyr::if_else(eval(condition), 1L, NA_integer_))
   }
   if (cost) {
     episode_file <- dplyr::mutate(episode_file, "{prefix}_cost" := dplyr::if_else(eval(condition), .data$cost_total_net, NA_real_))
@@ -495,7 +499,11 @@ create_cols <- function(episode_file, prefix, condition, drop) {
 #' @inheritParams na_type
 create_col <- function(episode_file, col, prefix, condition) {
   episode_file %>%
-    dplyr::mutate("{prefix}_{col}" := dplyr::if_else(eval(condition), .data[[tolower(col)]], na_type(col))) %>%
+    dplyr::mutate("{prefix}_{col}" := dplyr::if_else(
+      eval(condition),
+      .data[[tolower(col)]],
+      na_type(col)
+    )) %>%
     dplyr::select(dplyr::last_col())
 }
 
@@ -598,9 +606,9 @@ recode_gender <- function(episode_file) {
   episode_file %>%
     dplyr::mutate(
       gender = dplyr::if_else(
-        gender == 0 | gender == 9,
+        .data$gender %in% c(0, 9),
         1.5,
-        gender
+        .data$gender
       )
     )
 }

From f9e6f81465a17e84c6162b7949a029ab0c83dacf Mon Sep 17 00:00:00 2001
From: James McMahon <james.mcmahon@phs.scot>
Date: Tue, 4 Jul 2023 18:03:36 +0100
Subject: [PATCH 177/200] Supply `year` as a parameter to `clean_up_ch`

---
 R/create_individual_file.R | 28 +++++++++++++---------------
 man/clean_up_ch.Rd         |  4 +++-
 2 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index 54d1dbe93..56afed567 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -15,7 +15,7 @@ create_individual_file <- function(episode_file, year, write_to_disk = TRUE) {
     add_cij_columns() %>%
     add_all_columns() %>%
     aggregate_ch_episodes_zihao() %>%
-    clean_up_ch() %>%
+    clean_up_ch(year) %>%
     recode_gender() %>%
     aggregate_by_chi_zihao() %>%
     clean_individual_file()
@@ -556,7 +556,7 @@ aggregate_ch_episodes <- function(episode_file) {
 #' @description Clean up CH-related columns.
 #'
 #' @inheritParams create_individual_file
-clean_up_ch <- function(episode_file) {
+clean_up_ch <- function(episode_file, year) {
   cli::cli_alert_info("Clean up CH function started at {Sys.time()}")
 
   episode_file %>%
@@ -565,34 +565,32 @@ clean_up_ch <- function(episode_file) {
       fy_start = start_fy(year)
     ) %>%
     dplyr::mutate(
-      term_1 = pmin(ch_ep_end, fy_end + 1),
-      term_2 = pmax(ch_ep_start, fy_start)
+      term_1 = pmin(.data$ch_ep_end, .data$fy_end + 1),
+      term_2 = pmax(.data$ch_ep_start, .data$fy_start)
     ) %>%
     dplyr::mutate(
       ch_beddays = dplyr::if_else(
-        recid == "CH",
-        as.numeric(term_1 - term_2),
+        .data$recid == "CH",
+        as.numeric(.data$term_1 - .data$term_2),
         NA_real_
       ),
       ch_cost = dplyr::if_else(
-        recid == "CH" & ch_no_cost == 0,
-        ch_beddays * ch_cost_per_day,
+        .data$recid == "CH" & .data$ch_no_cost == 0,
+        .data$ch_beddays * .data$ch_cost_per_day,
         NA_real_
       ),
       ch_beddays = dplyr::if_else(
-        recid == "CH" & ch_chi_cis == 0,
+        .data$recid == "CH" & .data$ch_chi_cis == 0,
         0,
-        ch_beddays
+        .data$ch_beddays
       ),
       ch_cost = dplyr::if_else(
-        recid == "CH" & ch_chi_cis == 0,
+        .data$recid == "CH" & .data$ch_chi_cis == 0,
         0,
-        ch_cost
+        .data$ch_cost
       )
     ) %>%
-    dplyr::select(
-      -fy_end, -fy_start, -term_1, -term_2
-    )
+    dplyr::select(-c("fy_end", "fy_start", "term_1", "term_2"))
 }
 
 #' Recode gender
diff --git a/man/clean_up_ch.Rd b/man/clean_up_ch.Rd
index 64bb3e330..0182c84e8 100644
--- a/man/clean_up_ch.Rd
+++ b/man/clean_up_ch.Rd
@@ -4,10 +4,12 @@
 \alias{clean_up_ch}
 \title{Clean up CH}
 \usage{
-clean_up_ch(episode_file)
+clean_up_ch(episode_file, year)
 }
 \arguments{
 \item{episode_file}{Tibble containing episodic data}
+
+\item{year}{The year to process, in FY format.}
 }
 \description{
 Clean up CH-related columns.

From cb73e0ff3098c1dc41c1dee12b65910e900c5ab8 Mon Sep 17 00:00:00 2001
From: James McMahon <james.mcmahon@phs.scot>
Date: Wed, 5 Jul 2023 10:17:04 +0100
Subject: [PATCH 178/200] Supply `year` as a parameter to
 `clean_individual_file`

---
 R/create_individual_file.R   | 4 ++--
 man/clean_individual_file.Rd | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index 56afed567..e20221c2e 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -18,7 +18,7 @@ create_individual_file <- function(episode_file, year, write_to_disk = TRUE) {
     clean_up_ch(year) %>%
     recode_gender() %>%
     aggregate_by_chi_zihao() %>%
-    clean_individual_file()
+    clean_individual_file(year)
 
   if (write_to_disk) {
     slf_path <- get_file_path(
@@ -742,7 +742,7 @@ min_no_inf <- function(x) {
 #' @description Clean up columns in individual file
 #'
 #' @param individual_file Individual file where each row represents a unique CHI
-clean_individual_file <- function(individual_file) {
+clean_individual_file <- function(individual_file, year) {
   cli::cli_alert_info("Clean individual file function started at {Sys.time()}")
 
   individual_file %>%
diff --git a/man/clean_individual_file.Rd b/man/clean_individual_file.Rd
index 30d5479c6..c56e4265f 100644
--- a/man/clean_individual_file.Rd
+++ b/man/clean_individual_file.Rd
@@ -4,7 +4,7 @@
 \alias{clean_individual_file}
 \title{Clean individual file}
 \usage{
-clean_individual_file(individual_file)
+clean_individual_file(individual_file, year)
 }
 \arguments{
 \item{individual_file}{Individual file where each row represents a unique CHI}

From 42cc15e9a2614cc5e864c374cd627df85555c13f Mon Sep 17 00:00:00 2001
From: James McMahon <james.mcmahon@phs.scot>
Date: Wed, 5 Jul 2023 10:49:08 +0100
Subject: [PATCH 179/200] Only keep required variables to save memory

---
 R/create_individual_file.R | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index e20221c2e..f6c92b99c 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -11,6 +11,39 @@
 #' @export
 create_individual_file <- function(episode_file, year, write_to_disk = TRUE) {
   individual_file <- episode_file %>%
+    dplyr::select(
+      "year",
+      "chi",
+      "dob",
+      "gender",
+      "record_keydate1",
+      "record_keydate2",
+      "keytime1",
+      "keytime2",
+      "recid",
+      "smrtype",
+      "ipdc",
+      "postcode",
+      "gpprac",
+      "cij_marker",
+      "cij_start_date",
+      "cij_end_date",
+      "cij_pattype",
+      "cij_pattype_code",
+      "cij_ppa",
+      "ch_chi_cis",
+      "yearstay",
+      "cost_total_net",
+      "cost_total_net_inc_dnas",
+      "attendance_status",
+      "no_paid_items",
+      "total_no_dn_contacts",
+      "primary_delay_reason",
+      "sc_latest_submission",
+      "hc_hours_annual",
+      "hc_reablement",
+      "ooh_case_id"
+    ) %>%
     remove_blank_chi() %>%
     add_cij_columns() %>%
     add_all_columns() %>%

From 35a6ef2155cf8270bc9bbdb9e47d7e3a9f4c7e9a Mon Sep 17 00:00:00 2001
From: James McMahon <james.mcmahon@phs.scot>
Date: Wed, 5 Jul 2023 10:51:25 +0100
Subject: [PATCH 180/200] Rename the parameter so the documentation works

---
 R/aggregate_by_chi_zihao.R    | 4 ++--
 man/aggregate_by_chi_zihao.Rd | 5 ++++-
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/R/aggregate_by_chi_zihao.R b/R/aggregate_by_chi_zihao.R
index 5b2ccc1ff..edd9dc980 100644
--- a/R/aggregate_by_chi_zihao.R
+++ b/R/aggregate_by_chi_zihao.R
@@ -7,10 +7,10 @@
 #' @importFrom data.table .SD
 #'
 #' @inheritParams create_individual_file
-aggregate_by_chi_zihao <- function(individual_file) {
+aggregate_by_chi_zihao <- function(episode_file) {
   cli::cli_alert_info("Aggregate by CHI function started at {Sys.time()}")
 
-  individual_file <- individual_file %>%
+  individual_file <- episode_file %>%
     dplyr::select(-c("postcode", "gpprac")) %>%
     dplyr::rename(
       "gpprac" = "most_recent_gpprac",
diff --git a/man/aggregate_by_chi_zihao.Rd b/man/aggregate_by_chi_zihao.Rd
index a754fde4d..3d4961e19 100644
--- a/man/aggregate_by_chi_zihao.Rd
+++ b/man/aggregate_by_chi_zihao.Rd
@@ -4,7 +4,10 @@
 \alias{aggregate_by_chi_zihao}
 \title{Aggregate by CHI}
 \usage{
-aggregate_by_chi_zihao(individual_file)
+aggregate_by_chi_zihao(episode_file)
+}
+\arguments{
+\item{episode_file}{Tibble containing episodic data}
 }
 \description{
 Aggregate episode file by CHI to convert into

From 978d9e83a637d0c502d82787dff2644287d9e475 Mon Sep 17 00:00:00 2001
From: James McMahon <james.mcmahon@phs.scot>
Date: Wed, 5 Jul 2023 14:44:47 +0100
Subject: [PATCH 181/200] Use `setnames` to change names to lower

---
 R/aggregate_by_chi_zihao.R | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/R/aggregate_by_chi_zihao.R b/R/aggregate_by_chi_zihao.R
index edd9dc980..b8ee86ff3 100644
--- a/R/aggregate_by_chi_zihao.R
+++ b/R/aggregate_by_chi_zihao.R
@@ -21,10 +21,11 @@ aggregate_by_chi_zihao <- function(episode_file) {
       dplyr::ends_with("_postcode"),
       dplyr::ends_with("_DoB")
     ))
+  # Convert to data.table
+  data.table::setDT(episode_file)
 
-  names(individual_file) <- tolower(names(individual_file))
-
-  data.table::setDT(individual_file) # Convert to data.table
+  # Ensure all variable names are lowercase
+  data.table::setnames(episode_file, stringr::str_to_lower)
 
   # Sort the data within each chunk
   data.table::setkeyv(
@@ -39,7 +40,7 @@ aggregate_by_chi_zihao <- function(episode_file) {
   )
 
   data.table::setnames(
-    individual_file,
+    episode_file,
     c(
       "ch_chi_cis", "cij_marker", "ooh_case_id"
       # ,"hh_in_fy"

From 9be6385e32e2d8445e8b1d9c864397ae5034191a Mon Sep 17 00:00:00 2001
From: James McMahon <james.mcmahon@phs.scot>
Date: Wed, 5 Jul 2023 15:08:59 +0100
Subject: [PATCH 182/200] Remove unneeded code

---
 R/aggregate_by_chi_zihao.R |  43 +---------
 R/create_individual_file.R | 162 ++-----------------------------------
 2 files changed, 11 insertions(+), 194 deletions(-)

diff --git a/R/aggregate_by_chi_zihao.R b/R/aggregate_by_chi_zihao.R
index b8ee86ff3..390f3a119 100644
--- a/R/aggregate_by_chi_zihao.R
+++ b/R/aggregate_by_chi_zihao.R
@@ -10,17 +10,6 @@
 aggregate_by_chi_zihao <- function(episode_file) {
   cli::cli_alert_info("Aggregate by CHI function started at {Sys.time()}")
 
-  individual_file <- episode_file %>%
-    dplyr::select(-c("postcode", "gpprac")) %>%
-    dplyr::rename(
-      "gpprac" = "most_recent_gpprac",
-      "postcode" = "most_recent_postcode"
-    ) %>%
-    dplyr::select(-c(
-      dplyr::ends_with("_gpprac"),
-      dplyr::ends_with("_postcode"),
-      dplyr::ends_with("_DoB")
-    ))
   # Convert to data.table
   data.table::setDT(episode_file)
 
@@ -54,35 +43,11 @@ aggregate_by_chi_zihao <- function(episode_file) {
   # column specification, grouped by chi
   # columns to select last
   cols2 <- c(
-    vars_end_with(
-      individual_file,
-      c("postcode", "dob", "ggprac")
-    ),
-    "ooh_cases",
+    "postcode",
+    "dob",
     "gpprac",
-    "hbrescode",
-    "hscp",
-    "lca",
-    "ca2018",
-    "locality",
-    "datazone2011",
-    "hbpraccode",
-    "cluster",
-    "simd2020v2_rank",
-    "simd2020v2_sc_decile",
-    "simd2020v2_sc_quintile",
-    "simd2020v2_hb2019_decile",
-    "simd2020v2_hb2019_quintile",
-    "simd2020v2_hscp2019_decile",
-    "simd2020v2_hscp2019_quintile",
-    "ur8_2020",
-    "ur6_2020",
-    "ur3_2020",
-    "ur2_2020",
-    "hb2019",
-    "hscp2019",
-    "ca2019",
-    vars_start_with(individual_file, "sc_")
+    "ooh_cases",
+    vars_start_with(episode_file, "sc_")
   )
   # columns to count unique rows
   cols3 <- c(
diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index f6c92b99c..bcf17269c 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -354,7 +354,7 @@ add_nrs_columns <- function(episode_file, prefix, condition) {
 add_hl1_columns <- function(episode_file, prefix, condition) {
   condition <- substitute(condition)
   episode_file %>%
-    add_standard_cols(prefix, condition, drop = "gpprac")
+    add_standard_cols(prefix, condition)
 }
 
 #' Add CH columns
@@ -498,8 +498,7 @@ add_ipdc_cols <- function(episode_file, prefix, condition, ipdc_d = TRUE, electi
 #' @param drop Any columns out of "DoB", "postcode", and "gpprac" that should be dropped
 #' @param episode Whether to create prefix_episodes col, e.g. "Acute_episodes"
 #' @param cost Whether to create prefix_cost col, e.g. "Acute_cost"
-add_standard_cols <- function(episode_file, prefix, condition, drop = NULL, episode = FALSE, cost = FALSE) {
-  episode_file <- dplyr::bind_cols(episode_file, create_cols(episode_file, prefix, condition, drop))
+add_standard_cols <- function(episode_file, prefix, condition, episode = FALSE, cost = FALSE) {
   if (episode) {
     episode_file <- dplyr::mutate(episode_file, "{prefix}_episodes" := dplyr::if_else(eval(condition), 1L, NA_integer_))
   }
@@ -509,52 +508,6 @@ add_standard_cols <- function(episode_file, prefix, condition, drop = NULL, epis
   return(episode_file)
 }
 
-#' Create standard cols
-#'
-#' @description Create standard cols (DoB, postcode, gpprac).
-#'
-#' @inheritParams add_acute_columns
-#' @param drop Any columns out of "DoB", "postcode", and "gpprac" that should be dropped
-create_cols <- function(episode_file, prefix, condition, drop) {
-  cols <- c("DoB", "postcode", "gpprac")
-  if (!is.null(drop)) {
-    cols <- cols[cols != drop]
-  }
-  episode_file <- purrr::map_dfc(cols, ~ create_col(episode_file, .x, prefix, condition))
-  return(episode_file)
-}
-
-#' Create standard col
-#'
-#' @description Create single standard column.
-#'
-#' @inheritParams add_acute_columns
-#' @inheritParams na_type
-create_col <- function(episode_file, col, prefix, condition) {
-  episode_file %>%
-    dplyr::mutate("{prefix}_{col}" := dplyr::if_else(
-      eval(condition),
-      .data[[tolower(col)]],
-      na_type(col)
-    )) %>%
-    dplyr::select(dplyr::last_col())
-}
-
-#' NA type
-#'
-#' @description Helper function to use correct NA type depending on
-#' which type of column is created.
-#'
-#' @param col Which column to create ("DoB", "postcode", or "gpprac")
-na_type <- function(col = c("DoB", "postcode", "gpprac")) {
-  match.arg(col)
-  na_type <- switch(col,
-    "DoB" = lubridate::NA_Date_,
-    "postcode" = NA_character_,
-    "gpprac" = NA_real_
-  )
-  return(na_type)
-}
 
 #' Aggregate CIS episodes
 #'
@@ -778,41 +731,14 @@ min_no_inf <- function(x) {
 clean_individual_file <- function(individual_file, year) {
   cli::cli_alert_info("Clean individual file function started at {Sys.time()}")
 
-  individual_file %>%
-    drop_cols() %>%
-    clean_up_gender() %>%
-    dplyr::mutate(
-      age = compute_mid_year_age(year, .data$dob)
-    )
-}
-
-#' Drop redundant columns
-#'
-#' @description Drop redundant columns from individual file.
-#'
-#' @inheritParams clean_individual_file
-drop_cols <- function(individual_file) {
   individual_file %>%
     dplyr::select(
-      -month_cols(),
       -"ch_no_cost",
-      # -"dob",
-      # -"postcode",
-      # -"gpprac",
-      -"no_paid_items" # ,
-      #-"total_no_dn_contacts"
-    )
-}
-
-#' Month columns
-#'
-#' @description Return chr of column names following pattern
-#' "month_beddays" and "month_cost" e.g. apr_beddays" and "apr_cost"
-month_cols <- function() {
-  suffix <- c("_beddays", "_cost")
-  months <- tolower(c(rep(month.abb, each = 2)))
-  month_cols <- paste0(months, suffix)
-  return(month_cols)
+      -"no_paid_items",
+      -"total_no_dn_contacts"
+    ) %>%
+    clean_up_gender() %>%
+    dplyr::mutate(age = compute_mid_year_age(year, .data$dob))
 }
 
 #' Clean up gender column
@@ -829,77 +755,3 @@ clean_up_gender <- function(individual_file) {
       )
     )
 }
-
-
-#' Fill missing date of births
-#'
-#' @description Fill missing date of births with
-#' date of births from specific episode columns in hierarchy.
-#'
-#' @inheritParams clean_individual_file
-fill_dob <- function(individual_file) {
-  column_prefix <- c(
-    "PIS", "AE", "OoH", "OP", "Acute", "Mat", "DN", "CMH", "MH",
-    "GLS", "HL1", "CH", "HC", "AT", "SDS", "NSU", "NRS"
-  )
-  columns <- paste0(column_prefix, "_DoB")
-  for (i in length(columns)) {
-    individual_file <- replace_dob_with_col(individual_file, columns[i])
-  }
-  return(individual_file)
-}
-
-#' Fill missing date of births
-#'
-#' @description Fill missing date of births with
-#' date of births from an episode date of birth column.
-#'
-#' @inheritParams clean_individual_file
-#' @param col Column containing date of birth for episode
-replace_dob_with_col <- function(individual_file, col) {
-  individual_file %>%
-    dplyr::mutate(
-      DoB = dplyr::if_else(
-        is.na(.data$DoB) & !is.na(.data[[col]]),
-        .data[[col]],
-        .data$DoB
-      )
-    )
-}
-
-
-#' Fill missing postcodes
-#'
-#' @description Fill missing postcodes with
-#' postcodes from specific episode columns in hierarchy.
-#'
-#' @inheritParams clean_individual_file
-fill_dob <- function(individual_file) {
-  column_prefix <- c(
-    "PIS", "AE", "OoH", "OP", "Acute", "Mat", "HC", "DN", "CMH", "MH",
-    "GLS", "AT", "SDS", "CH", "NSU", "NRS", "HL1"
-  )
-  columns <- paste0(column_prefix, "_postcode")
-  for (i in length(columns)) {
-    individual_file <- replace_postcode_with_col(individual_file, columns[i])
-  }
-  return(individual_file)
-}
-
-#' Fill missing postcode
-#'
-#' @description Fill missing postcode with
-#' postcodes from an episode postcode column.
-#'
-#' @inheritParams clean_individual_file
-#' @param col Column containing postcode for episode
-replace_postcode_with_col <- function(individual_file, col) {
-  individual_file %>%
-    dplyr::mutate(
-      postcode = dplyr::if_else(
-        is.na(.data$postcode) & !is.na(.data[[col]]),
-        .data[[col]],
-        .data$postcode
-      )
-    )
-}

From 1ca40001731ef8542c09955e2133fc4e70a164f6 Mon Sep 17 00:00:00 2001
From: James McMahon <james.mcmahon@phs.scot>
Date: Wed, 5 Jul 2023 15:14:51 +0100
Subject: [PATCH 183/200] Update file path name

---
 R/create_individual_file.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index bcf17269c..051b10461 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -54,7 +54,7 @@ create_individual_file <- function(episode_file, year, write_to_disk = TRUE) {
     clean_individual_file(year)
 
   if (write_to_disk) {
-    slf_path <- get_file_path(
+    slf_indiv_path <- get_file_path(
       get_year_dir(year),
       stringr::str_glue(
         "source-individual-file-{year}.parquet"
@@ -62,7 +62,7 @@ create_individual_file <- function(episode_file, year, write_to_disk = TRUE) {
       check_mode = "write"
     )
 
-    write_file(episode_file, slf_path)
+    write_file(individual_file, slf_indiv_path)
   }
 
   return(individual_file)

From 3ebfecca09f955b8594352483eac8a5500110b60 Mon Sep 17 00:00:00 2001
From: James McMahon <james.mcmahon@phs.scot>
Date: Wed, 5 Jul 2023 15:16:06 +0100
Subject: [PATCH 184/200] Trim the return code

---
 R/aggregate_by_chi_zihao.R | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/R/aggregate_by_chi_zihao.R b/R/aggregate_by_chi_zihao.R
index 390f3a119..00029d469 100644
--- a/R/aggregate_by_chi_zihao.R
+++ b/R/aggregate_by_chi_zihao.R
@@ -151,10 +151,9 @@ aggregate_by_chi_zihao <- function(episode_file) {
     individual_file_cols5[, chi := NULL],
     individual_file_cols6[, chi := NULL]
   )
-  # convert back to tibble
-  individual_file <- dplyr::as_tibble(individual_file)
 
-  return(individual_file)
+  # convert back to tibble
+  return(dplyr::as_tibble(individual_file))
 }
 
 

From beae36a76bae412c06eb37f2e3a894da3818b3d3 Mon Sep 17 00:00:00 2001
From: James McMahon <james.mcmahon@phs.scot>
Date: Wed, 5 Jul 2023 15:20:23 +0100
Subject: [PATCH 185/200] Some fixes

---
 R/aggregate_by_chi_zihao.R | 29 ++++++++++++++---------------
 1 file changed, 14 insertions(+), 15 deletions(-)

diff --git a/R/aggregate_by_chi_zihao.R b/R/aggregate_by_chi_zihao.R
index 00029d469..2f05aebf5 100644
--- a/R/aggregate_by_chi_zihao.R
+++ b/R/aggregate_by_chi_zihao.R
@@ -16,9 +16,9 @@ aggregate_by_chi_zihao <- function(episode_file) {
   # Ensure all variable names are lowercase
   data.table::setnames(episode_file, stringr::str_to_lower)
 
-  # Sort the data within each chunk
+  # Sort the data
   data.table::setkeyv(
-    individual_file,
+    episode_file,
     c(
       "chi",
       "record_keydate1",
@@ -62,7 +62,7 @@ aggregate_by_chi_zihao <- function(episode_file) {
   # columns to sum up
   cols4 <- c(
     vars_end_with(
-      individual_file,
+      episode_file,
       c(
         "episodes",
         "beddays",
@@ -86,49 +86,48 @@ aggregate_by_chi_zihao <- function(episode_file) {
       )
     ),
     vars_start_with(
-      individual_file,
+      episode_file,
       "sds_option"
     ),
     "health_net_costincdnas"
   )
   cols4 <- cols4[!(cols4 %in% c("ch_cis_episodes"))]
   # columns to select maximum
-  cols5 <- vars_contain(individual_file, c("nsu", "hl1_in_fy"))
-  cols5 <- cols5[!(cols5 %in% c("ooh_consultation_time"))]
-
+  cols5 <- c("nsu", vars_contain(episode_file, c("hl1_in_fy")))
+  data.table::setnafill(episode_file, fill= 0L, cols = cols5)
   # compute
-  individual_file_cols1 <- individual_file[,
+  individual_file_cols1 <- episode_file[,
     .(gender = mean(gender)),
     by = "chi"
   ]
-  individual_file_cols2 <- individual_file[,
+  individual_file_cols2 <- episode_file[,
     .SD[.N],
     .SDcols = cols2,
     by = "chi"
   ]
-  individual_file_cols3 <- individual_file[,
+  individual_file_cols3 <- episode_file[,
     lapply(.SD, function(x) {
       data.table::uniqueN(x, na.rm = TRUE)
     }),
     .SDcols = cols3,
     by = "chi"
   ]
-  individual_file_cols4 <- individual_file[,
+  individual_file_cols4 <- episode_file[,
     lapply(.SD, function(x) {
       sum(x, na.rm = TRUE)
     }),
     .SDcols = cols4,
     by = "chi"
   ]
-  individual_file_cols5 <- individual_file[,
+  individual_file_cols5 <- episode_file[,
     lapply(.SD, function(x) max(x, na.rm = TRUE)),
     .SDcols = cols5,
     by = "chi"
   ]
-  individual_file_cols6 <- individual_file[,
+  individual_file_cols6 <- episode_file[,
     .(
       preventable_beddays = ifelse(
-        cij_ppa == 1,
+        max(cij_ppa, na.rm = TRUE),
         max(cij_end_date) - min(cij_start_date),
         NA_real_
       )
@@ -207,7 +206,7 @@ aggregate_ch_episodes_zihao <- function(episode_file) {
     ch_ep_start = min(record_keydate1),
     ch_ep_end = max(ch_ep_end),
     ch_cost_per_day = mean(ch_cost_per_day)
-  ), by = .("chi", "ch_chi_cis")]
+  ), by = c("chi", "ch_chi_cis")]
 
   # Convert back to tibble if needed
   episode_file <- tibble::as_tibble(episode_file)

From 13b7f1105083ed3e24683b8f734dfc1975d93d4c Mon Sep 17 00:00:00 2001
From: James McMahon <james.mcmahon@phs.scot>
Date: Wed, 5 Jul 2023 16:50:34 +0100
Subject: [PATCH 186/200] Correctly compute `ooh_cases`

---
 R/aggregate_by_chi_zihao.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/aggregate_by_chi_zihao.R b/R/aggregate_by_chi_zihao.R
index 2f05aebf5..a2638cbe1 100644
--- a/R/aggregate_by_chi_zihao.R
+++ b/R/aggregate_by_chi_zihao.R
@@ -46,7 +46,6 @@ aggregate_by_chi_zihao <- function(episode_file) {
     "postcode",
     "dob",
     "gpprac",
-    "ooh_cases",
     vars_start_with(episode_file, "sc_")
   )
   # columns to count unique rows
@@ -57,6 +56,7 @@ aggregate_by_chi_zihao <- function(episode_file) {
     "cij_non_el",
     "cij_mat",
     "cij_delay",
+    "ooh_cases",
     "preventable_admissions"
   )
   # columns to sum up

From c03a0ee47e021e0e95d6f12f009992041a7b80cc Mon Sep 17 00:00:00 2001
From: Moohan <Moohan@users.noreply.github.com>
Date: Wed, 5 Jul 2023 15:53:06 +0000
Subject: [PATCH 187/200] Update documentation

---
 man/add_standard_cols.Rd         |  5 ++---
 man/create_col.Rd                | 20 --------------------
 man/create_cols.Rd               | 20 --------------------
 man/drop_cols.Rd                 | 14 --------------
 man/fill_dob.Rd                  | 20 --------------------
 man/month_cols.Rd                | 12 ------------
 man/na_type.Rd                   | 15 ---------------
 man/replace_dob_with_col.Rd      | 17 -----------------
 man/replace_postcode_with_col.Rd | 17 -----------------
 9 files changed, 2 insertions(+), 138 deletions(-)
 delete mode 100644 man/create_col.Rd
 delete mode 100644 man/create_cols.Rd
 delete mode 100644 man/drop_cols.Rd
 delete mode 100644 man/fill_dob.Rd
 delete mode 100644 man/month_cols.Rd
 delete mode 100644 man/na_type.Rd
 delete mode 100644 man/replace_dob_with_col.Rd
 delete mode 100644 man/replace_postcode_with_col.Rd

diff --git a/man/add_standard_cols.Rd b/man/add_standard_cols.Rd
index becec0ddd..0a44e95ee 100644
--- a/man/add_standard_cols.Rd
+++ b/man/add_standard_cols.Rd
@@ -8,7 +8,6 @@ add_standard_cols(
   episode_file,
   prefix,
   condition,
-  drop = NULL,
   episode = FALSE,
   cost = FALSE
 )
@@ -20,11 +19,11 @@ add_standard_cols(
 
 \item{condition}{Condition to create new columns based on}
 
-\item{drop}{Any columns out of "DoB", "postcode", and "gpprac" that should be dropped}
-
 \item{episode}{Whether to create prefix_episodes col, e.g. "Acute_episodes"}
 
 \item{cost}{Whether to create prefix_cost col, e.g. "Acute_cost"}
+
+\item{drop}{Any columns out of "DoB", "postcode", and "gpprac" that should be dropped}
 }
 \description{
 Add standard columns (DoB, postcode, gpprac, episodes, cost) to episode file.
diff --git a/man/create_col.Rd b/man/create_col.Rd
deleted file mode 100644
index 7357adf5d..000000000
--- a/man/create_col.Rd
+++ /dev/null
@@ -1,20 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/create_individual_file.R
-\name{create_col}
-\alias{create_col}
-\title{Create standard col}
-\usage{
-create_col(episode_file, col, prefix, condition)
-}
-\arguments{
-\item{episode_file}{Tibble containing episodic data}
-
-\item{col}{Which column to create ("DoB", "postcode", or "gpprac")}
-
-\item{prefix}{Prefix to add to related columns, e.g. "Acute"}
-
-\item{condition}{Condition to create new columns based on}
-}
-\description{
-Create single standard column.
-}
diff --git a/man/create_cols.Rd b/man/create_cols.Rd
deleted file mode 100644
index 6bbe1d98a..000000000
--- a/man/create_cols.Rd
+++ /dev/null
@@ -1,20 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/create_individual_file.R
-\name{create_cols}
-\alias{create_cols}
-\title{Create standard cols}
-\usage{
-create_cols(episode_file, prefix, condition, drop)
-}
-\arguments{
-\item{episode_file}{Tibble containing episodic data}
-
-\item{prefix}{Prefix to add to related columns, e.g. "Acute"}
-
-\item{condition}{Condition to create new columns based on}
-
-\item{drop}{Any columns out of "DoB", "postcode", and "gpprac" that should be dropped}
-}
-\description{
-Create standard cols (DoB, postcode, gpprac).
-}
diff --git a/man/drop_cols.Rd b/man/drop_cols.Rd
deleted file mode 100644
index 8029d289c..000000000
--- a/man/drop_cols.Rd
+++ /dev/null
@@ -1,14 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/create_individual_file.R
-\name{drop_cols}
-\alias{drop_cols}
-\title{Drop redundant columns}
-\usage{
-drop_cols(individual_file)
-}
-\arguments{
-\item{individual_file}{Individual file where each row represents a unique CHI}
-}
-\description{
-Drop redundant columns from individual file.
-}
diff --git a/man/fill_dob.Rd b/man/fill_dob.Rd
deleted file mode 100644
index 3dc8e4295..000000000
--- a/man/fill_dob.Rd
+++ /dev/null
@@ -1,20 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/create_individual_file.R
-\name{fill_dob}
-\alias{fill_dob}
-\title{Fill missing date of births}
-\usage{
-fill_dob(individual_file)
-
-fill_dob(individual_file)
-}
-\arguments{
-\item{individual_file}{Individual file where each row represents a unique CHI}
-}
-\description{
-Fill missing date of births with
-date of births from specific episode columns in hierarchy.
-
-Fill missing postcodes with
-postcodes from specific episode columns in hierarchy.
-}
diff --git a/man/month_cols.Rd b/man/month_cols.Rd
deleted file mode 100644
index b8dd641e5..000000000
--- a/man/month_cols.Rd
+++ /dev/null
@@ -1,12 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/create_individual_file.R
-\name{month_cols}
-\alias{month_cols}
-\title{Month columns}
-\usage{
-month_cols()
-}
-\description{
-Return chr of column names following pattern
-"month_beddays" and "month_cost" e.g. apr_beddays" and "apr_cost"
-}
diff --git a/man/na_type.Rd b/man/na_type.Rd
deleted file mode 100644
index f8cbc9581..000000000
--- a/man/na_type.Rd
+++ /dev/null
@@ -1,15 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/create_individual_file.R
-\name{na_type}
-\alias{na_type}
-\title{NA type}
-\usage{
-na_type(col = c("DoB", "postcode", "gpprac"))
-}
-\arguments{
-\item{col}{Which column to create ("DoB", "postcode", or "gpprac")}
-}
-\description{
-Helper function to use correct NA type depending on
-which type of column is created.
-}
diff --git a/man/replace_dob_with_col.Rd b/man/replace_dob_with_col.Rd
deleted file mode 100644
index 61016ec2e..000000000
--- a/man/replace_dob_with_col.Rd
+++ /dev/null
@@ -1,17 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/create_individual_file.R
-\name{replace_dob_with_col}
-\alias{replace_dob_with_col}
-\title{Fill missing date of births}
-\usage{
-replace_dob_with_col(individual_file, col)
-}
-\arguments{
-\item{individual_file}{Individual file where each row represents a unique CHI}
-
-\item{col}{Column containing date of birth for episode}
-}
-\description{
-Fill missing date of births with
-date of births from an episode date of birth column.
-}
diff --git a/man/replace_postcode_with_col.Rd b/man/replace_postcode_with_col.Rd
deleted file mode 100644
index 3feb0fbcb..000000000
--- a/man/replace_postcode_with_col.Rd
+++ /dev/null
@@ -1,17 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/create_individual_file.R
-\name{replace_postcode_with_col}
-\alias{replace_postcode_with_col}
-\title{Fill missing postcode}
-\usage{
-replace_postcode_with_col(individual_file, col)
-}
-\arguments{
-\item{individual_file}{Individual file where each row represents a unique CHI}
-
-\item{col}{Column containing postcode for episode}
-}
-\description{
-Fill missing postcode with
-postcodes from an episode postcode column.
-}

From 00275769e0a1d03a7a201e64e014af12fe3e4c2d Mon Sep 17 00:00:00 2001
From: Moohan <Moohan@users.noreply.github.com>
Date: Wed, 5 Jul 2023 15:58:40 +0000
Subject: [PATCH 188/200] Style code

---
 R/aggregate_by_chi_zihao.R | 2 +-
 R/create_individual_file.R | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/R/aggregate_by_chi_zihao.R b/R/aggregate_by_chi_zihao.R
index a2638cbe1..735a549b2 100644
--- a/R/aggregate_by_chi_zihao.R
+++ b/R/aggregate_by_chi_zihao.R
@@ -94,7 +94,7 @@ aggregate_by_chi_zihao <- function(episode_file) {
   cols4 <- cols4[!(cols4 %in% c("ch_cis_episodes"))]
   # columns to select maximum
   cols5 <- c("nsu", vars_contain(episode_file, c("hl1_in_fy")))
-  data.table::setnafill(episode_file, fill= 0L, cols = cols5)
+  data.table::setnafill(episode_file, fill = 0L, cols = cols5)
   # compute
   individual_file_cols1 <- episode_file[,
     .(gender = mean(gender)),
diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index a47d91997..ab1c0cc32 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -52,7 +52,7 @@ create_individual_file <- function(episode_file, year, write_to_disk = TRUE) {
     recode_gender() %>%
     aggregate_by_chi_zihao() %>%
     clean_individual_file(year)
-    join_cohort_lookups(year) %>%
+  join_cohort_lookups(year) %>%
     match_on_ltcs(year) %>%
     join_deaths_data(year) %>%
     join_sparra_hhg(year)

From 60e3f3a50e40668f13649d7b0cd4b10385d229c5 Mon Sep 17 00:00:00 2001
From: James McMahon <james.mcmahon@phs.scot>
Date: Thu, 6 Jul 2023 09:11:09 +0000
Subject: [PATCH 189/200] [check-spelling] Update metadata

Update for https://github.com/Public-Health-Scotland/source-linkage-files/actions/runs/5466392495/attempts/1
Accepted in https://github.com/Public-Health-Scotland/source-linkage-files/pull/719#issuecomment-1623280566

Signed-off-by: check-spelling-bot <check-spelling-bot@users.noreply.github.com>
---
 .github/actions/spelling/expect.txt | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/.github/actions/spelling/expect.txt b/.github/actions/spelling/expect.txt
index f2b3553d2..ea3bab150 100644
--- a/.github/actions/spelling/expect.txt
+++ b/.github/actions/spelling/expect.txt
@@ -89,6 +89,7 @@ hhg
 hjust
 hms
 homecare
+homev
 hscp
 hscpnames
 infyyear
@@ -165,8 +166,12 @@ rspm
 RStudio
 rstudioapi
 Rtype
+SDcols
 seealso
 selfharm
+setkeyv
+setnafill
+setnames
 Siar
 sigfac
 simd

From c8d86c5dffffadada834e507ec280951e63c5188 Mon Sep 17 00:00:00 2001
From: James McMahon <james.mcmahon@phs.scot>
Date: Mon, 10 Jul 2023 15:59:59 +0100
Subject: [PATCH 190/200] Add targets for the individual file

---
 _targets.R | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/_targets.R b/_targets.R
index ec887724c..587abfd38 100644
--- a/_targets.R
+++ b/_targets.R
@@ -533,6 +533,21 @@ list(
         data = episode_file,
         year = year
       )
+    ),
+    tar_target(
+      individual_file,
+      create_individual_file(
+        episode_file = episode_file,
+        year = year,
+        write_to_disk = write_to_disk
+      )
+    ),
+    tar_target(
+      individual_file_tests,
+      process_tests_individual_file(
+        data = individual_file,
+        year = year
+      )
     )
   )
 )

From 62c70c5fd65186a1669b753137ab99ddb22d341c Mon Sep 17 00:00:00 2001
From: James McMahon <james.mcmahon@phs.scot>
Date: Mon, 10 Jul 2023 16:42:06 +0100
Subject: [PATCH 191/200] Fix missed pipe

---
 R/create_individual_file.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index ab1c0cc32..6b45ef722 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -51,8 +51,8 @@ create_individual_file <- function(episode_file, year, write_to_disk = TRUE) {
     clean_up_ch(year) %>%
     recode_gender() %>%
     aggregate_by_chi_zihao() %>%
-    clean_individual_file(year)
-  join_cohort_lookups(year) %>%
+    clean_individual_file(year) %>%
+    join_cohort_lookups(year) %>%
     match_on_ltcs(year) %>%
     join_deaths_data(year) %>%
     join_sparra_hhg(year)

From 54252a12c88204f822e0447293c859c5c6d42d0e Mon Sep 17 00:00:00 2001
From: Moohan <Moohan@users.noreply.github.com>
Date: Tue, 11 Jul 2023 13:08:37 +0000
Subject: [PATCH 192/200] Style code

---
 R/create_demographic_lookup.R |  3 +--
 R/create_service_use_lookup.R |  3 +--
 R/run_episode_file.R          | 28 ++++++++++++++--------------
 3 files changed, 16 insertions(+), 18 deletions(-)

diff --git a/R/create_demographic_lookup.R b/R/create_demographic_lookup.R
index 8fabd39a5..dfc2e25cf 100644
--- a/R/create_demographic_lookup.R
+++ b/R/create_demographic_lookup.R
@@ -13,8 +13,7 @@ create_demographic_cohorts <- function(
     data,
     year,
     update = latest_update(),
-    write_to_disk = TRUE
-) {
+    write_to_disk = TRUE) {
   check_variables_exist(
     data,
     c(
diff --git a/R/create_service_use_lookup.R b/R/create_service_use_lookup.R
index 7164038bc..30d3b0789 100644
--- a/R/create_service_use_lookup.R
+++ b/R/create_service_use_lookup.R
@@ -10,8 +10,7 @@ create_service_use_cohorts <- function(
     data,
     year,
     update = latest_update(),
-    write_to_disk = TRUE
-) {
+    write_to_disk = TRUE) {
   check_variables_exist(data, variables = c(
     "chi",
     "recid",
diff --git a/R/run_episode_file.R b/R/run_episode_file.R
index 0ec10c474..f0c8478dc 100644
--- a/R/run_episode_file.R
+++ b/R/run_episode_file.R
@@ -313,19 +313,19 @@ create_cohort_lookups <- function(data, year, update = latest_update()) {
 
   future_demographic <- future::future({
     create_demographic_cohorts(
-    data,
-    year,
-    update,
-    write_to_disk = TRUE
-  )
-    })
+      data,
+      year,
+      update,
+      write_to_disk = TRUE
+    )
+  })
   future_service_use <- future::future({
-  create_service_use_cohorts(
-    data,
-    year,
-    update,
-    write_to_disk = TRUE
-  )
+    create_service_use_cohorts(
+      data,
+      year,
+      update,
+      write_to_disk = TRUE
+    )
   })
 
   # This 'blocks' the code until they have both finished executing
@@ -341,11 +341,11 @@ create_cohort_lookups <- function(data, year, update = latest_update()) {
 #'
 #' @return The data including the Demographic and Service Use lookups.
 join_cohort_lookups <- function(data, year, update = latest_update()) {
-    join_cohort_lookups <- data %>%
+  join_cohort_lookups <- data %>%
     dplyr::left_join(
       read_file(
         get_demographic_cohorts_path(year, update),
-        col_select = c("chi","demographic_cohort")
+        col_select = c("chi", "demographic_cohort")
       ),
       by = "chi"
     ) %>%

From 9ae871a312c9f4064dbdb16f1fdd4f41181302d6 Mon Sep 17 00:00:00 2001
From: James McMahon <james.mcmahon@phs.scot>
Date: Tue, 11 Jul 2023 12:04:08 +0100
Subject: [PATCH 193/200] Update some targets to only run once a week

---
 _targets.R | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/_targets.R b/_targets.R
index 587abfd38..45a081b14 100644
--- a/_targets.R
+++ b/_targets.R
@@ -27,7 +27,10 @@ list(
     file_path_ext_clean,
     make_lowercase_ext(),
     priority = 1,
-    cue = tar_cue(mode = "always")
+    cue = tar_cue_age(
+      name = file_path_ext_clean,
+      age = as.difftime(7, units = "days")
+    )
   ),
   ## Lookup data ##
   tar_target(gpprac_opendata, get_gpprac_opendata()),
@@ -182,7 +185,10 @@ list(
       compress_extracts,
       gzip_files(year),
       priority = 1,
-      cue = tar_cue(mode = "always")
+      cue = tar_cue_age(
+        name = compress_extracts,
+        age = as.difftime(7, units = "days")
+      )
     ),
     ### target data extracts ###
     tar_file_read(

From 486b51d54a4e73be3f2dc1fb6a81f5639f3298ad Mon Sep 17 00:00:00 2001
From: James McMahon <james.mcmahon@phs.scot>
Date: Tue, 11 Jul 2023 16:30:55 +0100
Subject: [PATCH 194/200] Make the deaths lookup unique

---
 R/process_lookup_deaths.R | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/R/process_lookup_deaths.R b/R/process_lookup_deaths.R
index 2141e14f2..50689d24b 100644
--- a/R/process_lookup_deaths.R
+++ b/R/process_lookup_deaths.R
@@ -22,7 +22,9 @@ process_slf_deaths_lookup <- function(
     chi_deaths_data = read_file(get_slf_chi_deaths_path()),
     write_to_disk = TRUE) {
   slf_deaths_lookup <- nrs_deaths_data %>%
-    dplyr::select("chi", "record_keydate1") %>%
+    # Only modification over 'raw' NRS is to keep the earliest death date
+    dplyr::arrange(.data$record_keydate1) %>%
+    dplyr::distinct(.data$chi, .data$record_keydate1) %>%
     dplyr::mutate(
       death_date = .data$record_keydate1,
       deceased = TRUE,

From 5ad1928255db9cae1bfd7bfb1bf171ec01c5a20f Mon Sep 17 00:00:00 2001
From: James McMahon <james.mcmahon@phs.scot>
Date: Wed, 12 Jul 2023 10:14:51 +0100
Subject: [PATCH 195/200] Add `year` back to the individual file

---
 R/create_individual_file.R | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index 6b45ef722..975f0317d 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -55,7 +55,8 @@ create_individual_file <- function(episode_file, year, write_to_disk = TRUE) {
     join_cohort_lookups(year) %>%
     match_on_ltcs(year) %>%
     join_deaths_data(year) %>%
-    join_sparra_hhg(year)
+    join_sparra_hhg(year) %>%
+    dplyr::mutate(year = year)
 
   if (write_to_disk) {
     slf_indiv_path <- get_file_path(

From 507fffee54ed6202e2b2903a74c196fc141e08b5 Mon Sep 17 00:00:00 2001
From: James McMahon <james.mcmahon@phs.scot>
Date: Fri, 14 Jul 2023 08:23:13 +0100
Subject: [PATCH 196/200] Remove `cost_total_net_inc_dnas` from the indiv file 
 (#737)

* Drop `cost_total_net_inc_dnas`

* Rename `health_net_costincdnas` to `health_net_cost_inc_dnas`
---
 R/aggregate_by_chi_zihao.R |  2 +-
 R/create_individual_file.R | 11 +++++++----
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/R/aggregate_by_chi_zihao.R b/R/aggregate_by_chi_zihao.R
index 735a549b2..0eee203e8 100644
--- a/R/aggregate_by_chi_zihao.R
+++ b/R/aggregate_by_chi_zihao.R
@@ -89,7 +89,7 @@ aggregate_by_chi_zihao <- function(episode_file) {
       episode_file,
       "sds_option"
     ),
-    "health_net_costincdnas"
+    "health_net_cost_inc_dnas"
   )
   cols4 <- cols4[!(cols4 %in% c("ch_cis_episodes"))]
   # columns to select maximum
diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index 975f0317d..0ecfaaaab 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -167,7 +167,7 @@ add_all_columns <- function(episode_file) {
         ),
         na.rm = TRUE
       ),
-      health_net_costincdnas = .data$health_net_cost + dplyr::if_else(
+      health_net_cost_inc_dnas = .data$health_net_cost + dplyr::if_else(
         is.na(.data$OP_cost_dnas),
         0,
         .data$OP_cost_dnas
@@ -738,9 +738,12 @@ clean_individual_file <- function(individual_file, year) {
 
   individual_file %>%
     dplyr::select(
-      -"ch_no_cost",
-      -"no_paid_items",
-      -"total_no_dn_contacts"
+      !c(
+        "ch_no_cost",
+        "no_paid_items",
+        "total_no_dn_contacts",
+        "cost_total_net_inc_dnas"
+      )
     ) %>%
     clean_up_gender() %>%
     dplyr::mutate(age = compute_mid_year_age(year, .data$dob))

From 292f4d814a9ea4dc1104bafd06b90f89f75f4c95 Mon Sep 17 00:00:00 2001
From: Jennit07 <67372904+Jennit07@users.noreply.github.com>
Date: Fri, 14 Jul 2023 15:07:21 +0100
Subject: [PATCH 197/200] Join slf lookups onto individual file (#724)

* Create function for matching on slf lookups

* fix some build warnings

* Add `hbrescode` to select list

* Pass lookups as parameters/deal with hbrescode

* Update R/create_individual_file.R

---------

Co-authored-by: James McMahon <james.mcmahon@phs.scot>
---
 R/create_individual_file.R   | 34 +++++++++++++++++++++++++++++++++-
 R/run_episode_file.R         |  1 +
 man/add_standard_cols.Rd     |  2 --
 man/clean_individual_file.Rd |  2 ++
 man/join_cohort_lookups.Rd   |  2 ++
 man/join_slf_lookup_vars.Rd  | 27 +++++++++++++++++++++++++++
 6 files changed, 65 insertions(+), 3 deletions(-)
 create mode 100644 man/join_slf_lookup_vars.Rd

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index 0ecfaaaab..7825951c6 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -56,6 +56,7 @@ create_individual_file <- function(episode_file, year, write_to_disk = TRUE) {
     match_on_ltcs(year) %>%
     join_deaths_data(year) %>%
     join_sparra_hhg(year) %>%
+    join_slf_lookup_vars() %>%
     dplyr::mutate(year = year)
 
   if (write_to_disk) {
@@ -500,7 +501,6 @@ add_ipdc_cols <- function(episode_file, prefix, condition, ipdc_d = TRUE, electi
 #' @description Add standard columns (DoB, postcode, gpprac, episodes, cost) to episode file.
 #'
 #' @inheritParams add_acute_columns
-#' @param drop Any columns out of "DoB", "postcode", and "gpprac" that should be dropped
 #' @param episode Whether to create prefix_episodes col, e.g. "Acute_episodes"
 #' @param cost Whether to create prefix_cost col, e.g. "Acute_cost"
 add_standard_cols <- function(episode_file, prefix, condition, episode = FALSE, cost = FALSE) {
@@ -733,6 +733,7 @@ min_no_inf <- function(x) {
 #' @description Clean up columns in individual file
 #'
 #' @param individual_file Individual file where each row represents a unique CHI
+#' @param year Financial year e.g 1718
 clean_individual_file <- function(individual_file, year) {
   cli::cli_alert_info("Clean individual file function started at {Sys.time()}")
 
@@ -763,3 +764,34 @@ clean_up_gender <- function(individual_file) {
       )
     )
 }
+
+#' Join slf lookup variables
+#'
+#' @description Join lookup variables from slf postcode lookup and slf gpprac
+#'              lookup.
+#'
+#' @param individual_file the processed individual file.
+#' @param slf_postcode_lookup SLF processed postcode lookup
+#' @param slf_gpprac_lookup SLF processed gpprac lookup
+#' @param hbrescode_var hbrescode variable
+#'
+join_slf_lookup_vars <- function(individual_file,
+                                 slf_postcode_lookup = read_file(get_slf_postcode_path()),
+                                 slf_gpprac_lookup = read_file(
+                                   get_slf_gpprac_path(),
+                                   col_select = c("gpprac", "cluster", "hbpraccode")
+                                 ),
+                                 hbrescode_var = "hb2018") {
+  # Both lookups are left-joined, so every row of `individual_file` is kept;
+  # rows whose `postcode` / `gpprac` has no match get NA in the lookup
+  # columns.
+  individual_file <- individual_file %>%
+    dplyr::left_join(slf_postcode_lookup, by = "postcode") %>%
+    dplyr::left_join(slf_gpprac_lookup, by = "gpprac") %>%
+    # `hbrescode_var` is a string holding a column name, so wrap it in
+    # `all_of()`: passing a bare external character vector to a tidyselect
+    # verb is deprecated and is ambiguous if a column shares that name.
+    dplyr::rename(hbrescode = dplyr::all_of(hbrescode_var))
+
+  return(individual_file)
+}
diff --git a/R/run_episode_file.R b/R/run_episode_file.R
index ea22f776a..f640a4437 100644
--- a/R/run_episode_file.R
+++ b/R/run_episode_file.R
@@ -339,6 +339,7 @@ create_cohort_lookups <- function(data, year, update = latest_update()) {
 #' Join cohort lookups
 #'
 #' @inheritParams store_ep_file_vars
+#' @param update The latest update e.g. "Jun_2023"
 #'
 #' @return The data including the Demographic and Service Use lookups.
 join_cohort_lookups <- function(data, year, update = latest_update()) {
diff --git a/man/add_standard_cols.Rd b/man/add_standard_cols.Rd
index 0a44e95ee..744aa49de 100644
--- a/man/add_standard_cols.Rd
+++ b/man/add_standard_cols.Rd
@@ -22,8 +22,6 @@ add_standard_cols(
 \item{episode}{Whether to create prefix_episodes col, e.g. "Acute_episodes"}
 
 \item{cost}{Whether to create prefix_cost col, e.g. "Acute_cost"}
-
-\item{drop}{Any columns out of "DoB", "postcode", and "gpprac" that should be dropped}
 }
 \description{
 Add standard columns (DoB, postcode, gpprac, episodes, cost) to episode file.
diff --git a/man/clean_individual_file.Rd b/man/clean_individual_file.Rd
index c56e4265f..fb2d3ae13 100644
--- a/man/clean_individual_file.Rd
+++ b/man/clean_individual_file.Rd
@@ -8,6 +8,8 @@ clean_individual_file(individual_file, year)
 }
 \arguments{
 \item{individual_file}{Individual file where each row represents a unique CHI}
+
+\item{year}{Financial year e.g 1718}
 }
 \description{
 Clean up columns in individual file
diff --git a/man/join_cohort_lookups.Rd b/man/join_cohort_lookups.Rd
index 21f376bdc..7e18e022c 100644
--- a/man/join_cohort_lookups.Rd
+++ b/man/join_cohort_lookups.Rd
@@ -10,6 +10,8 @@ join_cohort_lookups(data, year, update = latest_update())
 \item{data}{The in progress episode file data.}
 
 \item{year}{The year to process, in FY format.}
+
+\item{update}{The latest update e.g. "Jun_2023"}
 }
 \value{
 The data including the Demographic and Service Use lookups.
diff --git a/man/join_slf_lookup_vars.Rd b/man/join_slf_lookup_vars.Rd
new file mode 100644
index 000000000..980c66f31
--- /dev/null
+++ b/man/join_slf_lookup_vars.Rd
@@ -0,0 +1,27 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{join_slf_lookup_vars}
+\alias{join_slf_lookup_vars}
+\title{Join slf lookup variables}
+\usage{
+join_slf_lookup_vars(
+  individual_file,
+  slf_postcode_lookup = read_file(get_slf_postcode_path()),
+  slf_gpprac_lookup = read_file(get_slf_gpprac_path(), col_select = c("gpprac",
+    "cluster", "hbpraccode")),
+  hbrescode_var = "hb2018"
+)
+}
+\arguments{
+\item{individual_file}{the processed individual file.}
+
+\item{slf_postcode_lookup}{SLF processed postcode lookup}
+
+\item{slf_gpprac_lookup}{SLF processed gpprac lookup}
+
+\item{hbrescode_var}{hbrescode variable}
+}
+\description{
+Join lookup variables from slf postcode lookup and slf gpprac
+lookup.
+}

From c644992aced8341d866d6111eaeb1e493a41f6ad Mon Sep 17 00:00:00 2001
From: Jennit07 <67372904+Jennit07@users.noreply.github.com>
Date: Mon, 17 Jul 2023 13:17:06 +0100
Subject: [PATCH 198/200] Join sc client variables onto individual file (#740)

* New function for matching sc client to indiv file

* Style code

* [check-spelling] Update metadata

Update for https://github.com/Public-Health-Scotland/source-linkage-files/actions/runs/5555048903/attempts/1
Accepted in https://github.com/Public-Health-Scotland/source-linkage-files/pull/740#issuecomment-1635955654

Signed-off-by: check-spelling-bot <check-spelling-bot@users.noreply.github.com>

* Code layout

* Style code

* Remove redundant sc variables

Co-authored-by: James McMahon <james.mcmahon@phs.scot>

* Update comments

Co-authored-by: James McMahon <james.mcmahon@phs.scot>

* Update comments

Co-authored-by: James McMahon <james.mcmahon@phs.scot>

* Sort order of parameters to pass `data` first

* Update documentation

* Style code

* Update R/create_individual_file.R

* Update R/create_individual_file.R

* Update R/create_individual_file.R

* Style code

---------

Signed-off-by: check-spelling-bot <check-spelling-bot@users.noreply.github.com>
Co-authored-by: Jennit07 <Jennit07@users.noreply.github.com>
Co-authored-by: James McMahon <james.mcmahon@phs.scot>
Co-authored-by: Moohan <Moohan@users.noreply.github.com>
---
 .github/actions/spelling/expect.txt |  1 +
 R/create_individual_file.R          | 39 +++++++++++++++++++++++++++++
 man/join_sc_client.Rd               | 26 +++++++++++++++++++
 3 files changed, 66 insertions(+)
 create mode 100644 man/join_sc_client.Rd

diff --git a/.github/actions/spelling/expect.txt b/.github/actions/spelling/expect.txt
index ea3bab150..f6f191d5a 100644
--- a/.github/actions/spelling/expect.txt
+++ b/.github/actions/spelling/expect.txt
@@ -148,6 +148,7 @@ purrr
 quickstart
 Rbuildignore
 rcmdcheck
+rdd
 rds
 reabl
 reablement
diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index 7825951c6..ab895926b 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -57,6 +57,7 @@ create_individual_file <- function(episode_file, year, write_to_disk = TRUE) {
     join_deaths_data(year) %>%
     join_sparra_hhg(year) %>%
     join_slf_lookup_vars() %>%
+    join_sc_client(year) %>%
     dplyr::mutate(year = year)
 
   if (write_to_disk) {
@@ -795,3 +796,41 @@ join_slf_lookup_vars <- function(individual_file,
 
   return(individual_file)
 }
+# TODO Remove the client data from the individual Social Care extracts
+# and instead, use this function in the episode file to match on the client
+# data to all episodes.
+#' Join sc client variables onto individual file
+#'
+#' @description Match on sc client variables.
+#'
+#' @param individual_file the processed individual file
+#' @param year financial year.
+#' @param sc_client SC client lookup
+#' @param sc_demographics SC Demographic lookup
+join_sc_client <- function(individual_file,
+                           year,
+                           sc_client = read_file(get_source_extract_path(year, "Client")),
+                           sc_demographics = read_file(get_sc_demog_lookup_path(),
+                             col_select = c("sending_location", "social_care_id", "chi")
+                           )) {
+  # TODO Update the client lookup processing script to match
+  # on demographics there so the client lookup already has CHI.
+
+  # Match to demographics lookup to get CHI
+  join_client_demog <- sc_client %>%
+    dplyr::left_join(
+      sc_demographics %>%
+        dplyr::select("sending_location", "social_care_id", "chi"),
+      by = c("sending_location", "social_care_id")
+    )
+
+  # Match on client variables by chi.
+  # Clients with no match in the demographics lookup have an NA chi after the
+  # join above, and dplyr joins treat NA keys as equal by default, so use
+  # `na_matches = "never"` to stop them attaching to NA-chi rows here.
+  individual_file <- individual_file %>%
+    dplyr::left_join(join_client_demog, by = "chi", na_matches = "never") %>%
+    dplyr::select(!c("sending_location", "social_care_id", "sc_latest_submission"))
+
+  return(individual_file)
+}
diff --git a/man/join_sc_client.Rd b/man/join_sc_client.Rd
new file mode 100644
index 000000000..a30719698
--- /dev/null
+++ b/man/join_sc_client.Rd
@@ -0,0 +1,26 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{join_sc_client}
+\alias{join_sc_client}
+\title{Join sc client variables onto individual file}
+\usage{
+join_sc_client(
+  individual_file,
+  year,
+  sc_client = read_file(get_source_extract_path(year, "Client")),
+  sc_demographics = read_file(get_sc_demog_lookup_path(), col_select =
+    c("sending_location", "social_care_id", "chi"))
+)
+}
+\arguments{
+\item{individual_file}{the processed individual file}
+
+\item{year}{financial year.}
+
+\item{sc_client}{SC client lookup}
+
+\item{sc_demographics}{SC Demographic lookup}
+}
+\description{
+Match on sc client variables.
+}

From 1bb52aa07f5f3e8dfef79b28957519b81f255a02 Mon Sep 17 00:00:00 2001
From: Moohan <Moohan@users.noreply.github.com>
Date: Mon, 17 Jul 2023 14:39:09 +0000
Subject: [PATCH 199/200] Update documentation

---
 man/join_cohort_lookups.Rd | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/man/join_cohort_lookups.Rd b/man/join_cohort_lookups.Rd
index 7e18e022c..fcd419a1b 100644
--- a/man/join_cohort_lookups.Rd
+++ b/man/join_cohort_lookups.Rd
@@ -11,7 +11,7 @@ join_cohort_lookups(data, year, update = latest_update())
 
 \item{year}{The year to process, in FY format.}
 
-\item{update}{The latest update e.g. "Jun_2023"}
+\item{update}{The update to use}
 }
 \value{
 The data including the Demographic and Service Use lookups.

From dabbf57cbd8dafa5340da323c54d4e389619835a Mon Sep 17 00:00:00 2001
From: James McMahon <james.mcmahon@phs.scot>
Date: Wed, 19 Jul 2023 11:15:18 +0100
Subject: [PATCH 200/200] Output the individual file with `anon_chi` (#748)

* Make episode file output with `anon_chi`

I've added this as a parameter so you can output CHI if desired, but the default is for anon_chi.

For the tests, it swaps back to CHI as there are some tests which specifically use the CHI number.

* Output `anon_chi` in the individual file

* Style code

* Sort variables with issues `hbrescode` (HB2018), `datazone` and `hscp` (#746)

* rename `hscp` to `hscp2018`

* rename `spd` as `slf_pc_lookup`

* Add `datazone2011` to coalesce code

* Rename `datazone` to `datazone2011`

* include `datazone2011_old` in selections

* Update R/fill_geographies.R

---------

Co-authored-by: James McMahon <james.mcmahon@phs.scot>

* Fix for anon_chi being NA

---------

Co-authored-by: Moohan <Moohan@users.noreply.github.com>
Co-authored-by: Jennit07 <67372904+Jennit07@users.noreply.github.com>
---
 .github/actions/spelling/expect.txt |  1 +
 R/create_individual_file.R          | 29 +++++++++++++++++++++++++----
 R/fill_geographies.R                | 11 ++++++-----
 R/process_tests_episode_file.R      |  5 +++--
 R/process_tests_individual_file.R   |  5 +++--
 R/run_episode_file.R                | 16 +++++++++++++++-
 man/create_individual_file.Rd       | 14 +++++++++++++-
 man/run_episode_file.Rd             | 10 +++++++++-
 8 files changed, 75 insertions(+), 16 deletions(-)

diff --git a/.github/actions/spelling/expect.txt b/.github/actions/spelling/expect.txt
index 6c13224ba..3236edd84 100644
--- a/.github/actions/spelling/expect.txt
+++ b/.github/actions/spelling/expect.txt
@@ -105,6 +105,7 @@ keydate
 keyring
 keytime
 keytimex
+kis
 lgl
 kis
 los
diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index ab895926b..675e2066a 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -3,13 +3,27 @@
 #' @description Creates individual file from episode file
 #'
 #' @param episode_file Tibble containing episodic data
-#' @param year The year to process, in FY format.
-#' @param write_to_disk (optional) Should the data be written to disk default is
-#' `TRUE` i.e. write the data to disk.
+#' @param anon_chi_in (Default:TRUE) Is `anon_chi` used in the input
+#' (instead of chi)
+#' @inheritParams run_episode_file
 #'
 #' @return The processed individual file
 #' @export
-create_individual_file <- function(episode_file, year, write_to_disk = TRUE) {
+create_individual_file <- function(
+    episode_file,
+    year,
+    write_to_disk = TRUE,
+    anon_chi_in = TRUE,
+    anon_chi_out = TRUE) {
+  if (anon_chi_in) {
+    episode_file <- slfhelper::get_chi(
+      episode_file,
+      anon_chi_var = "anon_chi",
+      drop = TRUE
+    ) %>%
+      dplyr::mutate(chi = dplyr::na_if(.data$chi, ""))
+  }
+
   individual_file <- episode_file %>%
     dplyr::select(
       "year",
@@ -60,6 +74,13 @@ create_individual_file <- function(episode_file, year, write_to_disk = TRUE) {
     join_sc_client(year) %>%
     dplyr::mutate(year = year)
 
+  if (anon_chi_out) {
+    individual_file <- individual_file %>%
+      tidyr::replace_na(list(chi = "")) %>%
+      slfhelper::get_anon_chi() %>%
+      dplyr::mutate(anon_chi = dplyr::na_if(.data$anon_chi, ""))
+  }
+
   if (write_to_disk) {
     slf_indiv_path <- get_file_path(
       get_year_dir(year),
diff --git a/R/fill_geographies.R b/R/fill_geographies.R
index 28bab7fa2..58d001493 100644
--- a/R/fill_geographies.R
+++ b/R/fill_geographies.R
@@ -85,7 +85,7 @@ make_gpprac_lookup <- function(data) {
 }
 
 fill_postcode_geogs <- function(data) {
-  spd <- read_file(get_slf_postcode_path())
+  slf_pc_lookup <- read_file(get_slf_postcode_path())
 
   filled_postcodes <- dplyr::left_join(
     data,
@@ -102,7 +102,7 @@ fill_postcode_geogs <- function(data) {
     ) %>%
     # Fill geographies
     dplyr::left_join(
-      spd,
+      slf_pc_lookup,
       by = "postcode",
       suffix = c("_old", "")
     ) %>%
@@ -117,10 +117,11 @@ fill_postcode_geogs <- function(data) {
     cascade_geographies() %>%
     dplyr::mutate(
       hbrescode = dplyr::coalesce(.data$hb2018, .data$hbrescode),
-      hscp = dplyr::coalesce(.data$hscp2018, .data$hscp),
-      lca = dplyr::coalesce(.data$lca, .data$lca_old)
+      hscp2018 = dplyr::coalesce(.data$hscp2018, .data$hscp),
+      lca = dplyr::coalesce(.data$lca, .data$lca_old),
+      datazone2011 = dplyr::coalesce(.data$datazone2011, .data$datazone2011_old)
     ) %>%
-    dplyr::select(!c("hb2018", "hscp2018", "lca_old", "most_recent_postcode"))
+    dplyr::select(!c("hb2018", "hscp", "lca_old", "datazone2011_old", "most_recent_postcode"))
 
   return(filled_postcodes)
 }
diff --git a/R/process_tests_episode_file.R b/R/process_tests_episode_file.R
index b595d1d54..46e9e7171 100644
--- a/R/process_tests_episode_file.R
+++ b/R/process_tests_episode_file.R
@@ -10,7 +10,7 @@ process_tests_episode_file <- function(data, year) {
   data <- data %>%
     dplyr::select(
       "year",
-      "chi",
+      "anon_chi",
       "gender",
       "postcode",
       "hbtreatcode",
@@ -20,7 +20,8 @@ process_tests_episode_file <- function(data, year) {
       "record_keydate1",
       "record_keydate2",
       dplyr::contains(c("beddays", "cost", "cij"))
-    )
+    ) %>%
+    slfhelper::get_chi()
 
   old_data <- get_existing_data_for_tests(data)
 
diff --git a/R/process_tests_individual_file.R b/R/process_tests_individual_file.R
index 32bbd8d3a..695dc19a0 100644
--- a/R/process_tests_individual_file.R
+++ b/R/process_tests_individual_file.R
@@ -10,7 +10,7 @@ process_tests_individual_file <- function(data, year) {
   data <- data %>%
     dplyr::select(
       "year",
-      "chi",
+      "anon_chi",
       "gender",
       "postcode",
       "dob",
@@ -26,7 +26,8 @@ process_tests_individual_file <- function(data, year) {
         "cases",
         "consultations"
       ))
-    )
+    ) %>%
+    slfhelper::get_chi()
 
   old_data <- get_existing_data_for_tests(data, file_version = "individual")
 
diff --git a/R/run_episode_file.R b/R/run_episode_file.R
index 45a4e6ed5..1f2bb33ed 100644
--- a/R/run_episode_file.R
+++ b/R/run_episode_file.R
@@ -4,11 +4,17 @@
 #' @param year The year to process, in FY format.
 #' @param write_to_disk (optional) Should the data be written to disk default is
 #' `TRUE` i.e. write the data to disk.
+#' @param anon_chi_out (Default:TRUE) Should `anon_chi` be used in the output
+#' (instead of chi)
 #'
 #' @return a [tibble][tibble::tibble-package] containing the episode file
 #' @export
 #'
-run_episode_file <- function(processed_data_list, year, write_to_disk = TRUE) {
+run_episode_file <- function(
+    processed_data_list,
+    year,
+    write_to_disk = TRUE,
+    anon_chi_out = TRUE) {
   episode_file <- dplyr::bind_rows(processed_data_list) %>%
     create_cost_inc_dna() %>%
     apply_cost_uplift() %>%
@@ -103,6 +109,14 @@ run_episode_file <- function(processed_data_list, year, write_to_disk = TRUE) {
     join_deaths_data(year) %>%
     load_ep_file_vars(year)
 
+  if (anon_chi_out) {
+    episode_file <- slfhelper::get_anon_chi(
+      episode_file,
+      chi_var = "chi",
+      drop = TRUE
+    )
+  }
+
   if (write_to_disk) {
     slf_path <- get_file_path(
       get_year_dir(year),
diff --git a/man/create_individual_file.Rd b/man/create_individual_file.Rd
index d1feb23df..fa759e7b1 100644
--- a/man/create_individual_file.Rd
+++ b/man/create_individual_file.Rd
@@ -4,7 +4,13 @@
 \alias{create_individual_file}
 \title{Create individual file}
 \usage{
-create_individual_file(episode_file, year, write_to_disk = TRUE)
+create_individual_file(
+  episode_file,
+  year,
+  write_to_disk = TRUE,
+  anon_chi_in = TRUE,
+  anon_chi_out = TRUE
+)
 }
 \arguments{
 \item{episode_file}{Tibble containing episodic data}
@@ -13,6 +19,12 @@ create_individual_file(episode_file, year, write_to_disk = TRUE)
 
 \item{write_to_disk}{(optional) Should the data be written to disk default is
 \code{TRUE} i.e. write the data to disk.}
+
+\item{anon_chi_in}{(Default:TRUE) Is \code{anon_chi} used in the input
+(instead of chi)}
+
+\item{anon_chi_out}{(Default:TRUE) Should \code{anon_chi} be used in the output
+(instead of chi)}
 }
 \value{
 The processed individual file
diff --git a/man/run_episode_file.Rd b/man/run_episode_file.Rd
index e85621b59..59d5fea1d 100644
--- a/man/run_episode_file.Rd
+++ b/man/run_episode_file.Rd
@@ -4,7 +4,12 @@
 \alias{run_episode_file}
 \title{Produce the Source Episode file}
 \usage{
-run_episode_file(processed_data_list, year, write_to_disk = TRUE)
+run_episode_file(
+  processed_data_list,
+  year,
+  write_to_disk = TRUE,
+  anon_chi_out = TRUE
+)
 }
 \arguments{
 \item{processed_data_list}{containing data from processed extracts.}
@@ -13,6 +18,9 @@ run_episode_file(processed_data_list, year, write_to_disk = TRUE)
 
 \item{write_to_disk}{(optional) Should the data be written to disk default is
 \code{TRUE} i.e. write the data to disk.}
+
+\item{anon_chi_out}{(Default:TRUE) Should \code{anon_chi} be used in the output
+(instead of chi)}
 }
 \value{
 a \link[tibble:tibble-package]{tibble} containing the episode file