From 3f86f892653d72127228aeb4ba4e8dbe8ccec86f Mon Sep 17 00:00:00 2001
From: James McMahon <james.mcmahon@phs.scot>
Date: Mon, 17 Jul 2023 13:23:15 +0100
Subject: [PATCH 01/16] Fix some R CMD check build errors and warnings (#743)

* Add {future} and {future.callr} as required packages

* Use `.data$`

* Document all parameters

* Simplify the code and avoid using `runif`

* [check-spelling] Update metadata

Update for https://github.com/Public-Health-Scotland/source-linkage-files/actions/runs/5573642848/attempts/1
Accepted in https://github.com/Public-Health-Scotland/source-linkage-files/pull/743#issuecomment-1637643299

Signed-off-by: check-spelling-bot <check-spelling-bot@users.noreply.github.com>

---------

Signed-off-by: check-spelling-bot <check-spelling-bot@users.noreply.github.com>
---
 .github/actions/spelling/expect.txt | 4 ++++
 DESCRIPTION                         | 2 ++
 R/produce_test_comparison.R         | 4 ++--
 R/run_episode_file.R                | 1 +
 R/write_tests_xlsx.R                | 4 ++--
 man/join_cohort_lookups.Rd          | 2 ++
 6 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/.github/actions/spelling/expect.txt b/.github/actions/spelling/expect.txt
index 50162e23f..23691cb31 100644
--- a/.github/actions/spelling/expect.txt
+++ b/.github/actions/spelling/expect.txt
@@ -94,6 +94,7 @@ itle
 iwalk
 jaccard
 jan
+jennifer
 jul
 keydate
 keyring
@@ -104,6 +105,8 @@ ltc
 ltcs
 lubridate
 magrittr
+Mcbride
+mcmahon
 MMMYY
 monthflag
 mpat
@@ -200,5 +203,6 @@ xintercept
 xlsx
 yearstay
 YYYYQX
+zihao
 zsav
 zstd
diff --git a/DESCRIPTION b/DESCRIPTION
index 31b205b36..02b87f21b 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -28,6 +28,8 @@ Imports:
     dtplyr (>= 1.3.0),
     fs (>= 1.6.1),
     fst (>= 0.9.8),
+    future (>= 1.33.0),
+    future.callr (>= 0.8.1),
     glue (>= 1.6.2),
     haven (>= 2.5.2),
     hms (>= 1.1.0),
diff --git a/R/produce_test_comparison.R b/R/produce_test_comparison.R
index dda1f2411..624623806 100644
--- a/R/produce_test_comparison.R
+++ b/R/produce_test_comparison.R
@@ -23,7 +23,7 @@ produce_test_comparison <- function(old_data, new_data, recid = FALSE) {
       dplyr::mutate(
         difference = round(.data$value_new - .data$value_old, digits = 2L),
         pct_change = .data$difference / .data$value_old,
-        issue = !dplyr::between(pct_change, -0.05, 0.05)
+        issue = !dplyr::between(.data$pct_change, -0.05, 0.05)
       )
   } else {
     dplyr::full_join(old_data,
@@ -34,7 +34,7 @@ produce_test_comparison <- function(old_data, new_data, recid = FALSE) {
       dplyr::mutate(
         difference = round(.data$value_new - .data$value_old, digits = 2L),
         pct_change = .data$difference / .data$value_old,
-        issue = !dplyr::between(pct_change, -0.05, 0.05)
+        issue = !dplyr::between(.data$pct_change, -0.05, 0.05)
       )
   }
 }
diff --git a/R/run_episode_file.R b/R/run_episode_file.R
index 00e864f40..df51d430d 100644
--- a/R/run_episode_file.R
+++ b/R/run_episode_file.R
@@ -340,6 +340,7 @@ create_cohort_lookups <- function(data, year, update = latest_update()) {
 #' Join cohort lookups
 #'
 #' @inheritParams store_ep_file_vars
+#' @inheritParams get_demographic_cohorts_path
 #'
 #' @return The data including the Demographic and Service Use lookups.
 join_cohort_lookups <- function(data, year, update = latest_update()) {
diff --git a/R/write_tests_xlsx.R b/R/write_tests_xlsx.R
index c517e496e..e187149d5 100644
--- a/R/write_tests_xlsx.R
+++ b/R/write_tests_xlsx.R
@@ -47,7 +47,7 @@ write_tests_xlsx <- function(comparison_data, sheet_name, year = NULL) {
     while (fs::file_exists(path = in_use_path) && seconds < max_wait) {
       # While the tests are in use (wait a random number of seconds from 1 to 30)
       cli::cli_progress_update()
-      wait <- round(runif(1, 1, 15))
+      wait <- sample(x = 3:15, size = 1)
 
       Sys.sleep(wait)
       seconds <- seconds + wait
@@ -56,7 +56,7 @@ write_tests_xlsx <- function(comparison_data, sheet_name, year = NULL) {
   }
 
   # Final check to maybe avoid corrupting the workbook
-  Sys.sleep(round(runif(1, 1, 3)))
+  Sys.sleep(sample(x = 1:3, size = 1))
   if (!fs::file_exists(path = in_use_path)) {
     fs::file_create(path = in_use_path)
   } else {
diff --git a/man/join_cohort_lookups.Rd b/man/join_cohort_lookups.Rd
index 21f376bdc..fcd419a1b 100644
--- a/man/join_cohort_lookups.Rd
+++ b/man/join_cohort_lookups.Rd
@@ -10,6 +10,8 @@ join_cohort_lookups(data, year, update = latest_update())
 \item{data}{The in progress episode file data.}
 
 \item{year}{The year to process, in FY format.}
+
+\item{update}{The update to use}
 }
 \value{
 The data including the Demographic and Service Use lookups.

From 5272eede4345c39ef456f24507a2d88d8f9b93b1 Mon Sep 17 00:00:00 2001
From: James McMahon <james.mcmahon@phs.scot>
Date: Mon, 17 Jul 2023 13:37:42 +0100
Subject: [PATCH 02/16] Some updates to `read_file()` (#727)

* Use `!!` injection operator as a simpler workaround

https://github.com/apache/arrow/issues/36658

* Add explicit `.rds.gz` and `.csv.gz` extension handling

Previously this assumed any file ending in `.gz` was a CSV.

* Style code

---------

Co-authored-by: Moohan <Moohan@users.noreply.github.com>
Co-authored-by: Jennit07 <67372904+Jennit07@users.noreply.github.com>
---
 R/read_file.R                   | 43 ++++++++++++++++++++++++---------
 tests/testthat/test-read_file.R |  5 +++-
 2 files changed, 35 insertions(+), 13 deletions(-)

diff --git a/R/read_file.R b/R/read_file.R
index 53300c70b..2941b62ed 100644
--- a/R/read_file.R
+++ b/R/read_file.R
@@ -16,10 +16,27 @@
 #' @return the data a [tibble][tibble::tibble-package]
 #' @export
 read_file <- function(path, col_select = NULL, as_data_frame = TRUE, ...) {
-  valid_extensions <- c("rds", "fst", "sav", "zsav", "csv", "gz", "parquet")
+  valid_extensions <- c(
+    "rds",
+    "rds.gz",
+    "fst",
+    "sav",
+    "zsav",
+    "csv",
+    "csv.gz",
+    "parquet"
+  )
 
   ext <- fs::path_ext(path)
 
+  if (ext == "gz") {
+    ext <- paste(
+      fs::path_ext(fs::path_ext_remove(path)),
+      "gz",
+      sep = "."
+    )
+  }
+
   if (!(ext %in% valid_extensions)) {
     cli::cli_abort(c(
       "x" = "Invalid extension: {.val {ext}}",
@@ -36,17 +53,19 @@ read_file <- function(path, col_select = NULL, as_data_frame = TRUE, ...) {
   }
 
   data <- switch(ext,
-    "rds" = readr::read_rds(path),
-    "fst" = fst::read_fst(path),
-    "sav" = haven::read_spss(path, ...),
-    "zsav" = haven::read_spss(path, ...),
-    "csv" = readr::read_csv(path, ..., show_col_types = FALSE),
-    "gz" = readr::read_csv(path, ..., show_col_types = FALSE),
-    "parquet" = if (is.null(col_select)) {
-      arrow::read_parquet(path, as_data_frame = as_data_frame, ...)
-    } else {
-      arrow::read_parquet(path, col_select = col_select, as_data_frame = as_data_frame, ...)
-    }
+    "rds" = readr::read_rds(file = path),
+    "rds.gz" = readr::read_rds(file = path),
+    "fst" = tibble::as_tibble(fst::read_fst(path = path)),
+    "sav" = haven::read_spss(file = path, ...),
+    "zsav" = haven::read_spss(file = path, ...),
+    "csv" = readr::read_csv(file = path, ..., show_col_types = FALSE),
+    "csv.gz" = readr::read_csv(file = path, ..., show_col_types = FALSE),
+    "parquet" = arrow::read_parquet(
+      file = path,
+      col_select = !!col_select,
+      as_data_frame = as_data_frame,
+      ...
+    )
   )
 
   return(data)
diff --git a/tests/testthat/test-read_file.R b/tests/testthat/test-read_file.R
index 392ba4a49..e823180fb 100644
--- a/tests/testthat/test-read_file.R
+++ b/tests/testthat/test-read_file.R
@@ -1,5 +1,6 @@
 test_that("read_file works", {
   rds_path <- tempfile(fileext = ".rds")
+  rds_gz_path <- tempfile(fileext = ".rds.gz")
   fst_path <- tempfile(fileext = ".fst")
   sav_path <- tempfile(fileext = ".sav")
   zsav_path <- tempfile(fileext = ".zsav")
@@ -10,6 +11,7 @@ test_that("read_file works", {
   aq_data <- tibble::as_tibble(datasets::airquality)
 
   readr::write_rds(aq_data, rds_path)
+  readr::write_rds(aq_data, rds_gz_path)
   fst::write_fst(aq_data, fst_path)
   haven::write_sav(aq_data, sav_path)
   haven::write_sav(aq_data, zsav_path, compress = "zsav")
@@ -18,7 +20,8 @@ test_that("read_file works", {
   arrow::write_parquet(aq_data, parquet_path)
 
   expect_equal(aq_data, read_file(rds_path))
-  expect_equal(aq_data, tibble::as_tibble(read_file(fst_path)))
+  expect_equal(aq_data, read_file(rds_gz_path))
+  expect_equal(aq_data, read_file(fst_path))
   expect_equal(aq_data, haven::zap_formats(read_file(sav_path)))
   expect_equal(aq_data, haven::zap_formats(read_file(zsav_path)))
   expect_equal(aq_data, read_file(csv_gz_path))

From 018dced9ecf5c79649c103a7e682287ce2e2c781 Mon Sep 17 00:00:00 2001
From: Jennit07 <67372904+Jennit07@users.noreply.github.com>
Date: Mon, 17 Jul 2023 13:56:14 +0100
Subject: [PATCH 03/16] Rename `datazone` to `datazone2011` (#744)

* Rename `datazone` to `datazone2011`

* [check-spelling] Update metadata

Update for https://github.com/Public-Health-Scotland/source-linkage-files/actions/runs/5575756558/attempts/1
Accepted in https://github.com/Public-Health-Scotland/source-linkage-files/pull/744#issuecomment-1638065893

Signed-off-by: check-spelling-bot <check-spelling-bot@users.noreply.github.com>

---------

Signed-off-by: check-spelling-bot <check-spelling-bot@users.noreply.github.com>
Co-authored-by: James McMahon <james.mcmahon@phs.scot>
---
 .github/actions/spelling/expect.txt  | 2 ++
 R/fill_geographies.R                 | 2 +-
 R/process_extract_acute.R            | 2 +-
 R/process_extract_district_nursing.R | 2 +-
 R/process_extract_gp_ooh.R           | 2 +-
 R/process_extract_mental_health.R    | 2 +-
 R/read_extract_acute.R               | 2 +-
 R/read_extract_district_nursing.R    | 2 +-
 R/read_extract_mental_health.R       | 2 +-
 R/read_extract_nrs_deaths.R          | 2 +-
 R/read_extract_ooh_consultations.R   | 2 +-
 R/run_episode_file.R                 | 2 +-
 12 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/.github/actions/spelling/expect.txt b/.github/actions/spelling/expect.txt
index 23691cb31..a1800b02f 100644
--- a/.github/actions/spelling/expect.txt
+++ b/.github/actions/spelling/expect.txt
@@ -88,6 +88,7 @@ hms
 homecare
 hscp
 hscpnames
+IDPC
 infyyear
 ipdc
 itle
@@ -100,6 +101,7 @@ keydate
 keyring
 keytime
 keytimex
+kis
 los
 ltc
 ltcs
diff --git a/R/fill_geographies.R b/R/fill_geographies.R
index 22b3f03ba..28bab7fa2 100644
--- a/R/fill_geographies.R
+++ b/R/fill_geographies.R
@@ -14,7 +14,7 @@ fill_geographies <- function(data) {
     "hbrescode",
     "hscp",
     "lca",
-    "datazone",
+    "datazone2011",
     "hbpraccode",
     "hbtreatcode",
     "gpprac"
diff --git a/R/process_extract_acute.R b/R/process_extract_acute.R
index db810b9fb..7d47d0ef4 100644
--- a/R/process_extract_acute.R
+++ b/R/process_extract_acute.R
@@ -79,7 +79,7 @@ process_extract_acute <- function(data, year, write_to_disk = TRUE) {
       "hbrescode",
       "lca",
       "hscp",
-      "datazone",
+      "datazone2011",
       "location",
       "hbtreatcode",
       "yearstay",
diff --git a/R/process_extract_district_nursing.R b/R/process_extract_district_nursing.R
index 2097b38d2..a1b3bf816 100644
--- a/R/process_extract_district_nursing.R
+++ b/R/process_extract_district_nursing.R
@@ -107,7 +107,7 @@ process_extract_district_nursing <- function(
           "gender",
           "gpprac",
           "postcode",
-          "datazone",
+          "datazone2011",
           "lca",
           "hscp",
           "hbrescode",
diff --git a/R/process_extract_gp_ooh.R b/R/process_extract_gp_ooh.R
index d85ce33f5..4add41cfa 100644
--- a/R/process_extract_gp_ooh.R
+++ b/R/process_extract_gp_ooh.R
@@ -111,7 +111,7 @@ process_extract_gp_ooh <- function(year, data_list, write_to_disk = TRUE) {
       "gpprac",
       "postcode",
       "hbrescode",
-      "datazone",
+      "datazone2011",
       "hscp",
       "hbtreatcode",
       "location",
diff --git a/R/process_extract_mental_health.R b/R/process_extract_mental_health.R
index f79eff35f..108c14c61 100644
--- a/R/process_extract_mental_health.R
+++ b/R/process_extract_mental_health.R
@@ -85,7 +85,7 @@ process_extract_mental_health <- function(data, year, write_to_disk = TRUE) {
       "hbrescode",
       "lca",
       "hscp",
-      "datazone",
+      "datazone2011",
       "location",
       "hbtreatcode",
       "stay",
diff --git a/R/read_extract_acute.R b/R/read_extract_acute.R
index 6c699d6b4..a0fba0707 100644
--- a/R/read_extract_acute.R
+++ b/R/read_extract_acute.R
@@ -107,7 +107,7 @@ read_extract_acute <- function(year, file_path = get_boxi_extract_path(year = ye
       disch = "Discharge Type Code",
       falls_adm = "Falls Related Admission (01)",
       lca = "Geo Council Area Code",
-      datazone = "Geo Data Zone 2011",
+      datazone2011 = "Geo Data Zone 2011",
       postcode = "Geo Postcode [C]",
       hscp = "Geo HSCP of Residence Code - current",
       conc = "Lead Consultant/HCP Code",
diff --git a/R/read_extract_district_nursing.R b/R/read_extract_district_nursing.R
index 5640fb7b7..607f9b47e 100644
--- a/R/read_extract_district_nursing.R
+++ b/R/read_extract_district_nursing.R
@@ -43,7 +43,7 @@ read_extract_district_nursing <- function(
       lca = "Patient Council Area Code (Contact)",
       postcode = "Patient Postcode [C] (Contact)",
       gpprac = "Practice Code (Contact)",
-      datazone = "Patient Data Zone 2011 (Contact)",
+      datazone2011 = "Patient Data Zone 2011 (Contact)",
       hbpraccode = "Practice NHS Board Code 9 (Contact)",
       hbtreatcode = "Treatment NHS Board Code 9",
       chi = "UPI Number [C]",
diff --git a/R/read_extract_mental_health.R b/R/read_extract_mental_health.R
index bbdd1d5f9..fe82732c8 100644
--- a/R/read_extract_mental_health.R
+++ b/R/read_extract_mental_health.R
@@ -83,7 +83,7 @@ read_extract_mental_health <- function(
       hbrescode = "NHS Board of Residence Code - current",
       lca = "Geo Council Area Code",
       hscp = "Geo HSCP of Residence Code - current",
-      datazone = "Geo Data Zone 2011",
+      datazone2011 = "Geo Data Zone 2011",
       location = "Treatment Location Code",
       hbtreatcode = "Treatment NHS Board Code - current",
       yearstay = "Occupied Bed Days (04)",
diff --git a/R/read_extract_nrs_deaths.R b/R/read_extract_nrs_deaths.R
index 8fd2f26e9..1734b23aa 100644
--- a/R/read_extract_nrs_deaths.R
+++ b/R/read_extract_nrs_deaths.R
@@ -39,7 +39,7 @@ read_extract_nrs_deaths <- function(
     dplyr::rename(
       death_location_code = "Death Location Code",
       lca = "Geo Council Area Code",
-      datazone = "Geo Data Zone 2011",
+      datazone2011 = "Geo Data Zone 2011",
       postcode = "Geo Postcode [C]",
       hscp = "Geo HSCP of Residence Code - current",
       death_board_occurrence = "NHS Board of Occurrence Code - current",
diff --git a/R/read_extract_ooh_consultations.R b/R/read_extract_ooh_consultations.R
index 1c32ca085..4e16527a3 100644
--- a/R/read_extract_ooh_consultations.R
+++ b/R/read_extract_ooh_consultations.R
@@ -34,7 +34,7 @@ read_extract_ooh_consultations <- function(
       postcode = "Patient Postcode [C]",
       hbrescode = "Patient NHS Board Code 9 - current",
       hscp = "HSCP of Residence Code Current",
-      datazone = "Patient Data Zone 2011",
+      datazone2011 = "Patient Data Zone 2011",
       gpprac = "Practice Code",
       ooh_case_id = "GUID",
       attendance_status = "Consultation Recorded",
diff --git a/R/run_episode_file.R b/R/run_episode_file.R
index df51d430d..45a4e6ed5 100644
--- a/R/run_episode_file.R
+++ b/R/run_episode_file.R
@@ -51,7 +51,7 @@ run_episode_file <- function(processed_data_list, year, write_to_disk = TRUE) {
         "cij_dis_spec",
         "cost_total_net",
         "hscp",
-        "datazone",
+        "datazone2011",
         "attendance_status",
         "deathdiag1",
         "deathdiag2",

From eae87509e39b9d066ff880617a8690a2b41a00d7 Mon Sep 17 00:00:00 2001
From: Jennit07 <67372904+Jennit07@users.noreply.github.com>
Date: Mon, 17 Jul 2023 16:02:07 +0100
Subject: [PATCH 04/16] Sort variables with issues `hbrescode` (HB2018),
 `datazone` and `hscp` (#746)

* rename `hscp` to `hscp2018`

* rename `spd` as `slf_pc_lookup`

* Add `datazone2011` to coalesce code

* Rename `datazone` to `datazone2011`

* include `datazone2011_old` in selections

* Update R/fill_geographies.R

---------

Co-authored-by: James McMahon <james.mcmahon@phs.scot>
---
 R/fill_geographies.R | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/R/fill_geographies.R b/R/fill_geographies.R
index 28bab7fa2..58d001493 100644
--- a/R/fill_geographies.R
+++ b/R/fill_geographies.R
@@ -85,7 +85,7 @@ make_gpprac_lookup <- function(data) {
 }
 
 fill_postcode_geogs <- function(data) {
-  spd <- read_file(get_slf_postcode_path())
+  slf_pc_lookup <- read_file(get_slf_postcode_path())
 
   filled_postcodes <- dplyr::left_join(
     data,
@@ -102,7 +102,7 @@ fill_postcode_geogs <- function(data) {
     ) %>%
     # Fill geographies
     dplyr::left_join(
-      spd,
+      slf_pc_lookup,
       by = "postcode",
       suffix = c("_old", "")
     ) %>%
@@ -117,10 +117,11 @@ fill_postcode_geogs <- function(data) {
     cascade_geographies() %>%
     dplyr::mutate(
       hbrescode = dplyr::coalesce(.data$hb2018, .data$hbrescode),
-      hscp = dplyr::coalesce(.data$hscp2018, .data$hscp),
-      lca = dplyr::coalesce(.data$lca, .data$lca_old)
+      hscp2018 = dplyr::coalesce(.data$hscp2018, .data$hscp),
+      lca = dplyr::coalesce(.data$lca, .data$lca_old),
+      datazone2011 = dplyr::coalesce(.data$datazone2011, .data$datazone2011_old)
     ) %>%
-    dplyr::select(!c("hb2018", "hscp2018", "lca_old", "most_recent_postcode"))
+    dplyr::select(!c("hb2018", "hscp", "lca_old", "datazone2011_old", "most_recent_postcode"))
 
   return(filled_postcodes)
 }

From 3b2c54df3e69523b8b16e2b5aa8fcf0b50cc66e7 Mon Sep 17 00:00:00 2001
From: James McMahon <james.mcmahon@phs.scot>
Date: Mon, 17 Jul 2023 16:23:45 +0100
Subject: [PATCH 05/16] Add priorities (default is 0) to targets (#745)

* Add priorities (default is 0) to targets

This should run the lookups and 'all year' files first.

* Use `qs` instead of `rds`

* Use {tarchetypes}

* Style code

---------

Co-authored-by: Moohan <Moohan@users.noreply.github.com>
Co-authored-by: Jennit07 <67372904+Jennit07@users.noreply.github.com>
---
 _targets.R | 36 ++++++++++++++++++++++--------------
 1 file changed, 22 insertions(+), 14 deletions(-)

diff --git a/_targets.R b/_targets.R
index 3625fa072..869d0d194 100644
--- a/_targets.R
+++ b/_targets.R
@@ -59,7 +59,8 @@ list(
     process_lookup_sc_demographics(
       sc_demog_data,
       write_to_disk = write_to_disk
-    )
+    ),
+    priority = 0.9
   ),
   tar_target(
     tests_sc_demog_lookup,
@@ -70,7 +71,8 @@ list(
     process_it_chi_deaths(
       data = it_chi_deaths_extract,
       write_to_disk = write_to_disk
-    )
+    ),
+    priority = 0.9
   ),
   tar_target(
     tests_it_chi_deaths,
@@ -83,7 +85,8 @@ list(
       gpprac_ref_path = gpprac_ref_path,
       spd_path = spd_path,
       write_to_disk = write_to_disk
-    )
+    ),
+    priority = 0.9
   ),
   tar_target(
     tests_source_gp_lookup,
@@ -96,16 +99,17 @@ list(
       simd_path = simd_path,
       locality_path = locality_path,
       write_to_disk = write_to_disk
-    )
+    ),
+    priority = 0.9
   ),
   tar_target(
     tests_source_pc_lookup,
     process_tests_lookup_pc(source_pc_lookup)
   ),
   ## Cost Lookups ##
-  tar_target(ch_cost_lookup, process_costs_ch_rmd()),
-  tar_target(dn_cost_lookup, process_costs_dn_rmd()),
-  tar_target(hc_cost_lookup, process_costs_hc_rmd()),
+  tar_target(ch_cost_lookup, process_costs_ch_rmd(), priority = 0.8),
+  tar_target(dn_cost_lookup, process_costs_dn_rmd(), priority = 0.8),
+  tar_target(hc_cost_lookup, process_costs_hc_rmd(), priority = 0.8),
   tar_target(gp_ooh_cost_lookup, process_costs_gp_ooh_rmd()),
   ## Social Care - 'All' data ##
   tar_target(
@@ -122,7 +126,8 @@ list(
       all_at_extract,
       sc_demog_lookup = sc_demog_lookup,
       write_to_disk = write_to_disk
-    )
+    ),
+    priority = 0.5
   ),
   tar_target(
     all_home_care_extract,
@@ -138,7 +143,8 @@ list(
       all_home_care_extract,
       sc_demog_lookup = sc_demog_lookup,
       write_to_disk = write_to_disk
-    )
+    ),
+    priority = 0.5
   ),
   tar_target(
     all_care_home_extract,
@@ -157,7 +163,8 @@ list(
       ch_name_lookup_path = slf_ch_name_lookup_path,
       spd_path = spd_path,
       write_to_disk = write_to_disk
-    )
+    ),
+    priority = 0.5
   ),
   tar_target(
     tests_all_care_home,
@@ -177,7 +184,8 @@ list(
       all_sds_extract,
       sc_demog_lookup = sc_demog_lookup,
       write_to_disk = write_to_disk
-    )
+    ),
+    priority = 0.5
   ),
   tar_map(
     list(year = years_to_run),
@@ -256,14 +264,14 @@ list(
       get_boxi_extract_path(year = year, type = "GP_OoH-c"),
       format = "file"
     ),
-    tar_target(ooh_data,
+    tar_qs(
+      ooh_data,
       read_extract_gp_ooh(
         year,
         diagnosis_data_path,
         outcomes_data_path,
         consultations_data_path
-      ),
-      format = "rds"
+      )
     ),
     ### Target source processed extracts ###
     tar_target(source_acute_extract, process_extract_acute(

From d09efcd7e736d14937ee7bdf1ab13b4c3caadbd6 Mon Sep 17 00:00:00 2001
From: James McMahon <james.mcmahon@phs.scot>
Date: Tue, 18 Jul 2023 08:15:11 +0100
Subject: [PATCH 06/16] Make episode file output with `anon_chi` (#747)

I've added this as a parameter so you can output CHI if desired, but the default is for anon_chi.

For the tests, it swaps back to CHI as there are some tests which specifically use the CHI number.

Co-authored-by: Jennit07 <67372904+Jennit07@users.noreply.github.com>
---
 R/process_tests_episode_file.R |  5 +++--
 R/run_episode_file.R           | 16 +++++++++++++++-
 man/run_episode_file.Rd        | 10 +++++++++-
 3 files changed, 27 insertions(+), 4 deletions(-)

diff --git a/R/process_tests_episode_file.R b/R/process_tests_episode_file.R
index b595d1d54..46e9e7171 100644
--- a/R/process_tests_episode_file.R
+++ b/R/process_tests_episode_file.R
@@ -10,7 +10,7 @@ process_tests_episode_file <- function(data, year) {
   data <- data %>%
     dplyr::select(
       "year",
-      "chi",
+      "anon_chi",
       "gender",
       "postcode",
       "hbtreatcode",
@@ -20,7 +20,8 @@ process_tests_episode_file <- function(data, year) {
       "record_keydate1",
       "record_keydate2",
       dplyr::contains(c("beddays", "cost", "cij"))
-    )
+    ) %>%
+    slfhelper::get_chi()
 
   old_data <- get_existing_data_for_tests(data)
 
diff --git a/R/run_episode_file.R b/R/run_episode_file.R
index 45a4e6ed5..1f2bb33ed 100644
--- a/R/run_episode_file.R
+++ b/R/run_episode_file.R
@@ -4,11 +4,17 @@
 #' @param year The year to process, in FY format.
 #' @param write_to_disk (optional) Should the data be written to disk default is
 #' `TRUE` i.e. write the data to disk.
+#' @param anon_chi_out (Default:TRUE) Should `anon_chi` be used in the output
+#' (instead of chi)
 #'
 #' @return a [tibble][tibble::tibble-package] containing the episode file
 #' @export
 #'
-run_episode_file <- function(processed_data_list, year, write_to_disk = TRUE) {
+run_episode_file <- function(
+    processed_data_list,
+    year,
+    write_to_disk = TRUE,
+    anon_chi_out = TRUE) {
   episode_file <- dplyr::bind_rows(processed_data_list) %>%
     create_cost_inc_dna() %>%
     apply_cost_uplift() %>%
@@ -103,6 +109,14 @@ run_episode_file <- function(processed_data_list, year, write_to_disk = TRUE) {
     join_deaths_data(year) %>%
     load_ep_file_vars(year)
 
+  if (anon_chi_out) {
+    episode_file <- slfhelper::get_anon_chi(
+      episode_file,
+      chi_var = "chi",
+      drop = TRUE
+    )
+  }
+
   if (write_to_disk) {
     slf_path <- get_file_path(
       get_year_dir(year),
diff --git a/man/run_episode_file.Rd b/man/run_episode_file.Rd
index e85621b59..59d5fea1d 100644
--- a/man/run_episode_file.Rd
+++ b/man/run_episode_file.Rd
@@ -4,7 +4,12 @@
 \alias{run_episode_file}
 \title{Produce the Source Episode file}
 \usage{
-run_episode_file(processed_data_list, year, write_to_disk = TRUE)
+run_episode_file(
+  processed_data_list,
+  year,
+  write_to_disk = TRUE,
+  anon_chi_out = TRUE
+)
 }
 \arguments{
 \item{processed_data_list}{containing data from processed extracts.}
@@ -13,6 +18,9 @@ run_episode_file(processed_data_list, year, write_to_disk = TRUE)
 
 \item{write_to_disk}{(optional) Should the data be written to disk default is
 \code{TRUE} i.e. write the data to disk.}
+
+\item{anon_chi_out}{(Default:TRUE) Should \code{anon_chi} be used in the output
+(instead of chi)}
 }
 \value{
 a \link[tibble:tibble-package]{tibble} containing the episode file

From 654dc00232a5ff87c1e97d614ad360cf42e03fcd Mon Sep 17 00:00:00 2001
From: James McMahon <james.mcmahon@phs.scot>
Date: Tue, 18 Jul 2023 08:19:11 +0100
Subject: [PATCH 07/16] Write out as a partitioned arrow dataset (#726)

* Also, write out as a partitioned arrow dataset

This will allow us to test if this format provides any additional benefits. It is still also written out as a (non-partitioned) parquet file.

* Fix a typo

---------

Co-authored-by: Jennit07 <67372904+Jennit07@users.noreply.github.com>
---
 R/run_episode_file.R | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/R/run_episode_file.R b/R/run_episode_file.R
index 1f2bb33ed..16b7ee3c2 100644
--- a/R/run_episode_file.R
+++ b/R/run_episode_file.R
@@ -118,7 +118,8 @@ run_episode_file <- function(
   }
 
   if (write_to_disk) {
-    slf_path <- get_file_path(
+    # TODO make the slf_path a function
+    slf_episode_path <- get_file_path(
       get_year_dir(year),
       stringr::str_glue(
         "source-episode-file-{year}.parquet"
@@ -126,7 +127,17 @@ run_episode_file <- function(
       check_mode = "write"
     )
 
-    write_file(episode_file, slf_path)
+    write_file(episode_file, slf_episode_path)
+
+    arrow::write_dataset(
+      dataset = episode_file,
+      path = fs::path_ext_remove(slf_episode_path),
+      format = "parquet",
+      # Should correspond to the available slfhelper filters
+      partitioning = c("recid", "hscp2018"),
+      compression = "zstd",
+      version = "latest"
+    )
   }
 
   return(episode_file)

From ee7445bf831b362bacdf3b899e0bbd70fb07b1fd Mon Sep 17 00:00:00 2001
From: James McMahon <james.mcmahon@phs.scot>
Date: Tue, 18 Jul 2023 14:39:40 +0100
Subject: [PATCH 08/16] Write the episode file as a partitioned dataset (#750)

* Revert "Write out as a partitioned arrow dataset (#726)"

This reverts commit 654dc00232a5ff87c1e97d614ad360cf42e03fcd.

* Write the episode file out as a partitioned dataset

This is its own target so it won't hold up the rest of the processing.
---
 R/run_episode_file.R | 10 ----------
 _targets.R           | 12 ++++++++++++
 2 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/R/run_episode_file.R b/R/run_episode_file.R
index 16b7ee3c2..668a40124 100644
--- a/R/run_episode_file.R
+++ b/R/run_episode_file.R
@@ -128,16 +128,6 @@ run_episode_file <- function(
     )
 
     write_file(episode_file, slf_episode_path)
-
-    arrow::write_dataset(
-      dataset = episode_file,
-      path = fs::path_ext_remove(slf_episode_path),
-      format = "parquet",
-      # Should correspond to the available slfhelper filters
-      partitioning = c("recid", "hscp2018"),
-      compression = "zstd",
-      version = "latest"
-    )
   }
 
   return(episode_file)
diff --git a/_targets.R b/_targets.R
index 869d0d194..58e5f573f 100644
--- a/_targets.R
+++ b/_targets.R
@@ -547,6 +547,18 @@ list(
         data = episode_file,
         year = year
       )
+    ),
+    tar_target(
+      episode_file_dataset,
+      arrow::write_dataset(
+        dataset = episode_file,
+        path = fs::path_ext_remove(slf_episode_path),
+        format = "parquet",
+        # Should correspond to the available slfhelper filters
+        partitioning = c("recid", "hscp2018"),
+        compression = "zstd",
+        version = "latest"
+      )
     )
   )
 )

From 74109bfdc7b9741922cbaad310af32a5045d8f18 Mon Sep 17 00:00:00 2001
From: James McMahon <james.mcmahon@phs.scot>
Date: Wed, 19 Jul 2023 11:06:04 +0100
Subject: [PATCH 09/16] Fix for anon_chi missing (#752)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Fix for anon_chi missing

`slfhelper::get_anon_chi` converts `NA` to "TkE=" which would then be converted back to "e1" 🤯

This fixes that by making NA CHIs blank `""`, then making blank anon_chis NA after.

slfhelper should be updated so that it always converts NA to NA.

* Update run_episode_file.R

* Style code

* Update documentation

---------

Co-authored-by: Moohan <Moohan@users.noreply.github.com>
---
 R/run_episode_file.R            | 20 ++++++++++----------
 man/correct_cij_vars.Rd         |  2 +-
 man/create_cohort_lookups.Rd    |  2 +-
 man/create_cost_inc_dna.Rd      |  2 +-
 man/fill_missing_cij_markers.Rd |  2 +-
 man/join_cohort_lookups.Rd      |  2 +-
 man/join_sparra_hhg.Rd          |  2 +-
 man/load_ep_file_vars.Rd        |  2 +-
 man/store_ep_file_vars.Rd       |  6 +++---
 9 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/R/run_episode_file.R b/R/run_episode_file.R
index 668a40124..852a4fd8b 100644
--- a/R/run_episode_file.R
+++ b/R/run_episode_file.R
@@ -86,7 +86,7 @@ run_episode_file <- function(
       )
     ) %>%
     # Check chi is valid using phsmethods function
-    # If the CHI is invalid for whatever reason, set the CHI to blank string
+    # If the CHI is invalid for whatever reason, set the CHI to NA
     dplyr::mutate(
       chi = dplyr::if_else(
         phsmethods::chi_check(.data$chi) != "Valid CHI",
@@ -110,11 +110,11 @@ run_episode_file <- function(
     load_ep_file_vars(year)
 
   if (anon_chi_out) {
-    episode_file <- slfhelper::get_anon_chi(
-      episode_file,
-      chi_var = "chi",
-      drop = TRUE
-    )
+    # TODO When slfhelper is updated remove the unnecessary code
+    episode_file <- episode_file %>%
+      tidyr::replace_na(list(chi = "")) %>%
+      slfhelper::get_anon_chi() %>%
+      dplyr::mutate(anon_chi = dplyr::na_if(.data$anon_chi, ""))
   }
 
   if (write_to_disk) {
@@ -135,10 +135,10 @@ run_episode_file <- function(
 
 #' Store the unneeded episode file variables
 #'
-#' @param data The in progress episode file data.
+#' @param data The in-progress episode file data.
 #' @inheritParams run_episode_file
-#' @param vars_to_keep a character vector of variable to keep, all others will
-#' be stored.
+#' @param vars_to_keep a character vector of the variables to keep, all others
+#' will be stored.
 #'
 #' @return `data` with only the `vars_to_keep` kept
 store_ep_file_vars <- function(data, year, vars_to_keep) {
@@ -324,7 +324,7 @@ create_cost_inc_dna <- function(data) {
 #'
 #' @return The data unchanged (the cohorts are written to disk)
 create_cohort_lookups <- function(data, year, update = latest_update()) {
-  # Use future so the cohorts can be create simultaneously (in parallel)
+  # Use future so the cohorts can be created simultaneously (in parallel)
   future::plan(strategy = future.callr::callr, .skip = TRUE)
   options(future.globals.maxSize = 21474836480)
 
diff --git a/man/correct_cij_vars.Rd b/man/correct_cij_vars.Rd
index 18ce990f8..97a7f046f 100644
--- a/man/correct_cij_vars.Rd
+++ b/man/correct_cij_vars.Rd
@@ -7,7 +7,7 @@
 correct_cij_vars(data)
 }
 \arguments{
-\item{data}{The in progress episode file data.}
+\item{data}{The in-progress episode file data.}
 }
 \value{
 The data with CIJ variables corrected.
diff --git a/man/create_cohort_lookups.Rd b/man/create_cohort_lookups.Rd
index cbfc1442f..f0ad267aa 100644
--- a/man/create_cohort_lookups.Rd
+++ b/man/create_cohort_lookups.Rd
@@ -7,7 +7,7 @@
 create_cohort_lookups(data, year, update = latest_update())
 }
 \arguments{
-\item{data}{The in progress episode file data.}
+\item{data}{The in-progress episode file data.}
 
 \item{year}{The year to process, in FY format.}
 
diff --git a/man/create_cost_inc_dna.Rd b/man/create_cost_inc_dna.Rd
index 588c602be..69e7e37b5 100644
--- a/man/create_cost_inc_dna.Rd
+++ b/man/create_cost_inc_dna.Rd
@@ -7,7 +7,7 @@
 create_cost_inc_dna(data)
 }
 \arguments{
-\item{data}{The in progress episode file data.}
+\item{data}{The in-progress episode file data.}
 }
 \value{
 The data with cost including dna.
diff --git a/man/fill_missing_cij_markers.Rd b/man/fill_missing_cij_markers.Rd
index 002c8d927..03b64217e 100644
--- a/man/fill_missing_cij_markers.Rd
+++ b/man/fill_missing_cij_markers.Rd
@@ -7,7 +7,7 @@
 fill_missing_cij_markers(data)
 }
 \arguments{
-\item{data}{The in progress episode file data.}
+\item{data}{The in-progress episode file data.}
 }
 \value{
 A data frame with CIJ markers filled in for those missing.
diff --git a/man/join_cohort_lookups.Rd b/man/join_cohort_lookups.Rd
index fcd419a1b..445dcd7c0 100644
--- a/man/join_cohort_lookups.Rd
+++ b/man/join_cohort_lookups.Rd
@@ -7,7 +7,7 @@
 join_cohort_lookups(data, year, update = latest_update())
 }
 \arguments{
-\item{data}{The in progress episode file data.}
+\item{data}{The in-progress episode file data.}
 
 \item{year}{The year to process, in FY format.}
 
diff --git a/man/join_sparra_hhg.Rd b/man/join_sparra_hhg.Rd
index 9bbdd916a..ab4d3b946 100644
--- a/man/join_sparra_hhg.Rd
+++ b/man/join_sparra_hhg.Rd
@@ -7,7 +7,7 @@
 join_sparra_hhg(data, year)
 }
 \arguments{
-\item{data}{The in progress episode file data.}
+\item{data}{The in-progress episode file data.}
 
 \item{year}{The year to process, in FY format.}
 }
diff --git a/man/load_ep_file_vars.Rd b/man/load_ep_file_vars.Rd
index d290ba512..cee9cc440 100644
--- a/man/load_ep_file_vars.Rd
+++ b/man/load_ep_file_vars.Rd
@@ -7,7 +7,7 @@
 load_ep_file_vars(data, year)
 }
 \arguments{
-\item{data}{The in progress episode file data.}
+\item{data}{The in-progress episode file data.}
 
 \item{year}{The year to process, in FY format.}
 }
diff --git a/man/store_ep_file_vars.Rd b/man/store_ep_file_vars.Rd
index f31f63976..06316aac1 100644
--- a/man/store_ep_file_vars.Rd
+++ b/man/store_ep_file_vars.Rd
@@ -7,12 +7,12 @@
 store_ep_file_vars(data, year, vars_to_keep)
 }
 \arguments{
-\item{data}{The in progress episode file data.}
+\item{data}{The in-progress episode file data.}
 
 \item{year}{The year to process, in FY format.}
 
-\item{vars_to_keep}{a character vector of variable to keep, all others will
-be stored.}
+\item{vars_to_keep}{a character vector of the variables to keep, all others
+will be stored.}
 }
 \value{
 \code{data} with only the \code{vars_to_keep} kept

From 8db37690172f5de1a7a580dda0c2a2da293d6d94 Mon Sep 17 00:00:00 2001
From: James McMahon <james.mcmahon@phs.scot>
Date: Wed, 19 Jul 2023 12:43:20 +0100
Subject: [PATCH 10/16] Create individual file (#715)

* Until L594

* Converted until L677

* Until L731

* Update documentation

* Remove test ref

* Style code

* WIP writing functions to fill postcode in line with previous DOB functions

* Update documentation

* implement quick fix for running 22/23

* Style code

* Fix missed comma

* Exclude DD code for now - TEMP fix

* Correct/rename variables

* Style code

* Include NSU in `check_year_valid`

* Update `check_year_valid_tests`

* Update documentation

* Update `add_nsu_cohort` to pick up years valid

* Style code

* remove extra `!`

* Exclude `cij_delay`

* Style code

* improve `max_no_inf()`

* Use pmin/max instead of `rowwise`

* improve `min_no_inf()`

* Use n_distinct(cij_marker)

* deal with distinct(ch_chi_cis)

* use n_distinct(ooh_case_id)

* remove `find_non_duplicates`

* Use dplyr::if_else()

Co-authored-by: James McMahon <james.mcmahon@phs.scot>

* Fix typo in `ooh_covid_assessment`

* Move `ooh_case_id` to aggregate

* Use `slfhelper::ltc_vars`

* Remove `clean_up_dob`
Already done in `correct_demographics`

* Update documentation

* [check-spelling] Update metadata

Update for https://github.com/Public-Health-Scotland/source-linkage-files/actions/runs/4981058958/attempts/1
Accepted in https://github.com/Public-Health-Scotland/source-linkage-files/pull/654#issuecomment-1551009850

Signed-off-by: check-spelling-bot <check-spelling-bot@users.noreply.github.com>

* Use `start_next_fy_quarter` in place of rowwise

* Style code

* Use `compute_mid_year_age`

* convert code into data.table for improving speed

* Update `get_fy_dates`function

* remove `date_from_fy`, use `get_fy_dates`

* Update documentation

* Remove `clean_up_postcode` function
Not needed anymore

* Remove non duplicates function/move to aggregate

* Style code

* Update documentation

* Add time stamps to `create_individual_file`

* Style code

* remove `clean_up_postcode`

* Deal with ch cis episodes

* Style code

* add .data$

* Turn ch aggregate into a data table

* Style code

* use ch_chi_cis

* remove `preventable_admissions` from aggregate

* exclude `hh_in_fy` for now

* Style code

* Test - exclude `sc_` vars from aggregate

* Style code

* Exclude for now

* exclude for now

* Style code

* automate `check_year_valid`

* Return dummy file path for NSU not valid

* Style code

* Fix brackets in aggregate

* TEMP - exclude variables

* Use `phsmethods::sex_from_chi`

* Style code

* Add ungroup()

* lowercase dob

* Remove as.data.table

* rewrite aggregate_by_chi with data.table

* Style code

* minor changes

* Use the updated function

* to properly import data.table

* remove redundant columns dob postcode and gpprac

* minor changes to remove redundant postcode gpprac columns

* Style code

* rename columns with small letters

* Style code

* newaggregate_ch_episodes

* Update documentation

* add functions to replace regular expressions to select column/variables

* Update documentation

* Style code

* minor changes

* add a missing variable, cij_delay

* Style code

* add variables cij_delay, preventable_beddays

* add missing variables health_net_cost, health_net_costincdnas, and cmh, dd sds columns

* Style code

* add more variables needed

* Style code

* Update R/link_delayed_discharge_eps.R

* Style code

* amend costs

* Style code

* Revert "amend costs"

This reverts commit 8048e68c829edbf6c0c43e1bf3ade1d142e0e250.

* Add DN and cij_delay back in

* fix the issue

* Style code

* remove running in chunks

* Style code

* Update tests to include missing variables

* Remove unnecessary comma

* fix the bug of preventable_beddays

* Update documentation

* fix total ae_attendances

* fix the bug of preventable_admissions

* fix the bug of hbrescode etc

* minor fix

* minor fix

* Style code

* Fix some warnings being produced by the tests

* Fix failing test

* remove running in chunks

* Style code

* Update the targets config to use `timestamp_positives` as the default reporter

* fix the bug of preventable_beddays

* Update documentation

* fix total ae_attendances

* fix the bug of preventable_admissions

* fix the bug of hbrescode etc

* minor fix

* minor fix

* Style code

* fix home care cost

* add ipdc to fix maternity

* fix preventable admission and care home cost

* fix preventable_admissions and calculate preventable_beddays here

* add monthly_beddays and yearstay to dd

* Style code

* fix preventable_admissions and preventable_beddays

* Style code

* include parameter for write to disk/year

* Add lookups to indiv file creation pipeline

* include parameter for write to disk/year

* fix delay discharge beddays and yearstay

* Style code

* fix preventable issues

* Style code

* fix the issue of preventable stuff

* Style code

* Update R/aggregate_by_chi_zihao.R

* Update documentation

* Fix minor typos

* [check-spelling] Update metadata

Update for https://github.com/Public-Health-Scotland/source-linkage-files/actions/runs/5443581387/attempts/1
Accepted in https://github.com/Public-Health-Scotland/source-linkage-files/pull/709#issuecomment-1617917895

Signed-off-by: check-spelling-bot <check-spelling-bot@users.noreply.github.com>

* Remove some obsolete comments

* Remove some unnecessary brackets

* Reformat some code

* Use some `dplyr` functions for readability

* Style code

* Update R/link_delayed_discharge_eps.R

* Style code

* Remove some code which is no longer needed

We now match on these variables after

* Work out preventable admissions with similar indicators

* Lowercase variable names

* Restore `cij_delay`

* Restore DN variables

* Tidy the code and use integers where possible

* Supply `year` as a parameter to `clean_up_ch`

* Supply `year` as a parameter to `clean_individual_file`

* Only keep required variables to save memory

* Rename the parameter so the documentation works

* Use `setnames` to change names to lower

* Remove unneeded code

* Update file path name

* Trim the return code

* Some fixes

* Correctly compute `ooh_cases`

* Update documentation

* Style code

* [check-spelling] Update metadata

Update for https://github.com/Public-Health-Scotland/source-linkage-files/actions/runs/5466392495/attempts/1
Accepted in https://github.com/Public-Health-Scotland/source-linkage-files/pull/719#issuecomment-1623280566

Signed-off-by: check-spelling-bot <check-spelling-bot@users.noreply.github.com>

* Add targets for the individual file

* Fix missed pipe

* Style code

* Update some targets to only run once a week

* Make the deaths lookup unique

* Add `year` back to the individual file

* Remove `cost_total_net_inc_dnas` from the indiv file  (#737)

* Drop `cost_total_net_inc_dnas`

* Rename `health_net_costincdnas` to `health_net_cost_inc_dnas`

* Join slf lookups onto individual file (#724)

* Create function for matching on slf lookups

* fix some build warnings

* Add `hbrescode` to select list

* Pass lookups as parameters/deal with hbrescode

* Update R/create_individual_file.R

---------

Co-authored-by: James McMahon <james.mcmahon@phs.scot>

* Join sc client variables onto individual file (#740)

* New function for matching sc client to indiv file

* Style code

* [check-spelling] Update metadata

Update for https://github.com/Public-Health-Scotland/source-linkage-files/actions/runs/5555048903/attempts/1
Accepted in https://github.com/Public-Health-Scotland/source-linkage-files/pull/740#issuecomment-1635955654

Signed-off-by: check-spelling-bot <check-spelling-bot@users.noreply.github.com>

* Code layout

* Style code

* Remove redundant sc variables

Co-authored-by: James McMahon <james.mcmahon@phs.scot>

* Update comments

Co-authored-by: James McMahon <james.mcmahon@phs.scot>

* Update comments

Co-authored-by: James McMahon <james.mcmahon@phs.scot>

* Sort order of parameters to pass `data` first

* Update documentation

* Style code

* Update R/create_individual_file.R

* Update R/create_individual_file.R

* Update R/create_individual_file.R

* Style code

---------

Signed-off-by: check-spelling-bot <check-spelling-bot@users.noreply.github.com>
Co-authored-by: Jennit07 <Jennit07@users.noreply.github.com>
Co-authored-by: James McMahon <james.mcmahon@phs.scot>
Co-authored-by: Moohan <Moohan@users.noreply.github.com>

* Update documentation

* Output the individual file with `anon_chi` (#748)

* Make episode file output with `anon_chi`

I've added this as a parameter so you can output CHI if desired, but the default is for anon_chi.

For the tests, it swaps back to CHI as there are some tests which specifically use the CHI number.

* Output `anon_chi` in the individual file

* Style code

* Sort variables with issues `hbrescode` (HB2018), `datazone` and `hscp` (#746)

* rename `hscp` to `hscp2018`

* rename `spd` as `slf_pc_lookup`

* Add `datazone2011` to coalesce code

* Rename `datazone` to `datazone2011`

* include `datazone2011_old` in selections

* Update R/fill_geographies.R

---------

Co-authored-by: James McMahon <james.mcmahon@phs.scot>

* Fix for anon_chi being NA

---------

Co-authored-by: Moohan <Moohan@users.noreply.github.com>
Co-authored-by: Jennit07 <67372904+Jennit07@users.noreply.github.com>

---------

Signed-off-by: check-spelling-bot <check-spelling-bot@users.noreply.github.com>
Co-authored-by: Mandy Norrbo <mandy@jumpingrivers.com>
Co-authored-by: jr-mandy <jr-mandy@users.noreply.github.com>
Co-authored-by: shintoLamp <bateman.mcbride@phs.scot>
Co-authored-by: shintoLamp <shintoLamp@users.noreply.github.com>
Co-authored-by: Jennit07 <67372904+Jennit07@users.noreply.github.com>
Co-authored-by: Jennifer Thom <jennifer.thom@phs.scot>
Co-authored-by: Jennit07 <Jennit07@users.noreply.github.com>
Co-authored-by: Zihao Li <lizihao_anu@outlook.com>
Co-authored-by: lizihao-anu <lizihao-anu@users.noreply.github.com>
Co-authored-by: Moohan <Moohan@users.noreply.github.com>
Co-authored-by: Zihao Li <zihao.li@phs.scot>
---
 .github/actions/spelling/expect.txt |  13 +
 DESCRIPTION                         |   3 +-
 NAMESPACE                           |   3 +
 R/aggregate_by_chi_zihao.R          | 215 +++++++
 R/create_individual_file.R          | 857 ++++++++++++++++++++++++++++
 R/process_tests_individual_file.R   |  31 +-
 _targets.R                          |  14 +
 man/add_acute_columns.Rd            |  18 +
 man/add_ae_columns.Rd               |  18 +
 man/add_all_columns.Rd              |  15 +
 man/add_at_columns.Rd               |  18 +
 man/add_ch_columns.Rd               |  18 +
 man/add_cij_columns.Rd              |  14 +
 man/add_cmh_columns.Rd              |  18 +
 man/add_dd_columns.Rd               |  18 +
 man/add_dn_columns.Rd               |  18 +
 man/add_gls_columns.Rd              |  18 +
 man/add_hc_columns.Rd               |  18 +
 man/add_hl1_columns.Rd              |  18 +
 man/add_ipdc_cols.Rd                |  23 +
 man/add_mat_columns.Rd              |  18 +
 man/add_mh_columns.Rd               |  18 +
 man/add_nrs_columns.Rd              |  18 +
 man/add_nsu_columns.Rd              |  18 +
 man/add_ooh_columns.Rd              |  18 +
 man/add_op_columns.Rd               |  18 +
 man/add_pis_columns.Rd              |  18 +
 man/add_sds_columns.Rd              |  18 +
 man/add_standard_cols.Rd            |  28 +
 man/aggregate_by_chi.Rd             |  15 +
 man/aggregate_by_chi_zihao.Rd       |  15 +
 man/aggregate_ch_episodes.Rd        |  14 +
 man/aggregate_ch_episodes_zihao.Rd  |  14 +
 man/clean_individual_file.Rd        |  16 +
 man/clean_up_ch.Rd                  |  16 +
 man/clean_up_gender.Rd              |  14 +
 man/condition_cols.Rd               |  13 +
 man/create_individual_file.Rd       |  34 ++
 man/join_sc_client.Rd               |  26 +
 man/join_slf_lookup_vars.Rd         |  27 +
 man/max_no_inf.Rd                   |  16 +
 man/min_no_inf.Rd                   |  16 +
 man/recode_gender.Rd                |  14 +
 man/remove_blank_chi.Rd             |  14 +
 man/select.Rd                       |  30 +
 45 files changed, 1818 insertions(+), 16 deletions(-)
 create mode 100644 R/aggregate_by_chi_zihao.R
 create mode 100644 R/create_individual_file.R
 create mode 100644 man/add_acute_columns.Rd
 create mode 100644 man/add_ae_columns.Rd
 create mode 100644 man/add_all_columns.Rd
 create mode 100644 man/add_at_columns.Rd
 create mode 100644 man/add_ch_columns.Rd
 create mode 100644 man/add_cij_columns.Rd
 create mode 100644 man/add_cmh_columns.Rd
 create mode 100644 man/add_dd_columns.Rd
 create mode 100644 man/add_dn_columns.Rd
 create mode 100644 man/add_gls_columns.Rd
 create mode 100644 man/add_hc_columns.Rd
 create mode 100644 man/add_hl1_columns.Rd
 create mode 100644 man/add_ipdc_cols.Rd
 create mode 100644 man/add_mat_columns.Rd
 create mode 100644 man/add_mh_columns.Rd
 create mode 100644 man/add_nrs_columns.Rd
 create mode 100644 man/add_nsu_columns.Rd
 create mode 100644 man/add_ooh_columns.Rd
 create mode 100644 man/add_op_columns.Rd
 create mode 100644 man/add_pis_columns.Rd
 create mode 100644 man/add_sds_columns.Rd
 create mode 100644 man/add_standard_cols.Rd
 create mode 100644 man/aggregate_by_chi.Rd
 create mode 100644 man/aggregate_by_chi_zihao.Rd
 create mode 100644 man/aggregate_ch_episodes.Rd
 create mode 100644 man/aggregate_ch_episodes_zihao.Rd
 create mode 100644 man/clean_individual_file.Rd
 create mode 100644 man/clean_up_ch.Rd
 create mode 100644 man/clean_up_gender.Rd
 create mode 100644 man/condition_cols.Rd
 create mode 100644 man/create_individual_file.Rd
 create mode 100644 man/join_sc_client.Rd
 create mode 100644 man/join_slf_lookup_vars.Rd
 create mode 100644 man/max_no_inf.Rd
 create mode 100644 man/min_no_inf.Rd
 create mode 100644 man/recode_gender.Rd
 create mode 100644 man/remove_blank_chi.Rd
 create mode 100644 man/select.Rd

diff --git a/.github/actions/spelling/expect.txt b/.github/actions/spelling/expect.txt
index a1800b02f..3236edd84 100644
--- a/.github/actions/spelling/expect.txt
+++ b/.github/actions/spelling/expect.txt
@@ -28,6 +28,7 @@ cmh
 CNWs
 commhosp
 congen
+costincdnas
 costmonthnum
 costsfy
 covr
@@ -45,6 +46,7 @@ dbconnect
 dbplyr
 deathdiag
 demog
+dfc
 disch
 dischloc
 dischto
@@ -70,6 +72,7 @@ fyyear
 geogs
 ggplot
 GLS
+gls
 gms
 GPOo
 gpprac
@@ -86,6 +89,7 @@ hhg
 hjust
 hms
 homecare
+homev
 hscp
 hscpnames
 IDPC
@@ -102,6 +106,8 @@ keyring
 keytime
 keytimex
 kis
+lgl
+kis
 los
 ltc
 ltcs
@@ -116,6 +122,7 @@ multiday
 multisession
 multistaff
 NAs
+newcons
 nhs
 nhshosp
 NRS
@@ -147,7 +154,9 @@ purrr
 quickstart
 Rbuildignore
 rcmdcheck
+rdd
 rds
+reabl
 reablement
 readcode
 readr
@@ -164,8 +173,12 @@ rspm
 RStudio
 rstudioapi
 Rtype
+SDcols
 seealso
 selfharm
+setkeyv
+setnafill
+setnames
 Siar
 sigfac
 simd
diff --git a/DESCRIPTION b/DESCRIPTION
index 02b87f21b..a437b80cc 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -55,7 +55,8 @@ Imports:
     stringr (>= 1.5.0),
     tibble (>= 3.2.1),
     tidyr (>= 1.3.0),
-    tidyselect (>= 1.2.0)
+    tidyselect (>= 1.2.0),
+    zoo (>= 1.8.0)
 Suggests:
     covr (>= 3.6.1),
     roxygen2 (>= 7.2.3),
diff --git a/NAMESPACE b/NAMESPACE
index 642146578..d87bf9397 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -13,6 +13,7 @@ export(convert_hscp_to_hscpnames)
 export(convert_numeric_to_date)
 export(convert_sending_location_to_lca)
 export(convert_year_to_fyyear)
+export(create_individual_file)
 export(create_service_use_cohorts)
 export(end_fy)
 export(end_fy_quarter)
@@ -160,6 +161,8 @@ export(start_fy)
 export(start_fy_quarter)
 export(start_next_fy_quarter)
 export(write_file)
+importFrom(data.table,.N)
+importFrom(data.table,.SD)
 importFrom(magrittr,"%>%")
 importFrom(readr,col_character)
 importFrom(readr,col_date)
diff --git a/R/aggregate_by_chi_zihao.R b/R/aggregate_by_chi_zihao.R
new file mode 100644
index 000000000..0eee203e8
--- /dev/null
+++ b/R/aggregate_by_chi_zihao.R
@@ -0,0 +1,215 @@
+#' Aggregate by CHI
+#'
+#' @description Aggregate episode file by CHI to convert into
+#' individual file.
+#'
+#' @importFrom data.table .N
+#' @importFrom data.table .SD
+#'
+#' @inheritParams create_individual_file
+aggregate_by_chi_zihao <- function(episode_file) {
+  cli::cli_alert_info("Aggregate by CHI function started at {Sys.time()}")
+
+  # Convert to data.table by reference (the input object itself is modified)
+  data.table::setDT(episode_file)
+
+  # Ensure all variable names are lowercase
+  data.table::setnames(episode_file, stringr::str_to_lower)
+
+  # Sort by chi then date/time so `.SD[.N]` below is the last-sorted row
+  data.table::setkeyv(
+    episode_file,
+    c(
+      "chi",
+      "record_keydate1",
+      "keytime1",
+      "record_keydate2",
+      "keytime2"
+    )
+  )
+
+  data.table::setnames(
+    episode_file,
+    c(
+      "ch_chi_cis", "cij_marker", "ooh_case_id"
+      # ,"hh_in_fy"
+    ),
+    c(
+      "ch_cis_episodes", "cij_total", "ooh_cases"
+      # ,"hl1_in_fy"
+    )
+  )
+
+  # column specification, grouped by chi
+  # columns where the last (sorted) row per chi is kept
+  cols2 <- c(
+    "postcode",
+    "dob",
+    "gpprac",
+    vars_start_with(episode_file, "sc_")
+  )
+  # columns where the distinct non-missing values are counted
+  cols3 <- c(
+    "ch_cis_episodes",
+    "cij_total",
+    "cij_el",
+    "cij_non_el",
+    "cij_mat",
+    "cij_delay",
+    "ooh_cases",
+    "preventable_admissions"
+  )
+  # columns to sum up (a name can be listed twice, e.g. it ends in "_dnas")
+  cols4 <- c(
+    vars_end_with(
+      episode_file,
+      c(
+        "episodes",
+        "beddays",
+        "cost",
+        "attendances",
+        "attend",
+        "contacts",
+        "hours",
+        "alarms",
+        "telecare",
+        "paid_items",
+        "advice",
+        "homev",
+        "time",
+        "assessment",
+        "other",
+        "dn",
+        "nhs24",
+        "pcc",
+        "_dnas"
+      )
+    ),
+    vars_start_with(
+      episode_file,
+      "sds_option"
+    ),
+    "health_net_cost_inc_dnas"
+  )
+  cols4 <- setdiff(cols4, "ch_cis_episodes") # setdiff() also de-duplicates
+  # columns to select maximum
+  cols5 <- c("nsu", vars_contain(episode_file, c("hl1_in_fy")))
+  data.table::setnafill(episode_file, fill = 0L, cols = cols5)
+  # compute: one row per chi, in the same chi order, for every summary
+  individual_file_cols1 <- episode_file[,
+    .(gender = mean(gender)),
+    by = "chi"
+  ]
+  individual_file_cols2 <- episode_file[,
+    .SD[.N],
+    .SDcols = cols2,
+    by = "chi"
+  ]
+  individual_file_cols3 <- episode_file[,
+    lapply(.SD, function(x) {
+      data.table::uniqueN(x, na.rm = TRUE)
+    }),
+    .SDcols = cols3,
+    by = "chi"
+  ]
+  individual_file_cols4 <- episode_file[,
+    lapply(.SD, function(x) {
+      sum(x, na.rm = TRUE)
+    }),
+    .SDcols = cols4,
+    by = "chi"
+  ]
+  individual_file_cols5 <- episode_file[,
+    lapply(.SD, function(x) max(x, na.rm = TRUE)),
+    .SDcols = cols5,
+    by = "chi"
+  ]
+  individual_file_cols6 <- episode_file[,
+    .(
+      preventable_beddays = ifelse(
+        any(cij_ppa == 1, na.rm = TRUE), # FALSE (not -Inf) when all are NA
+        max(cij_end_date) - min(cij_start_date),
+        NA_real_
+      )
+    ),
+    # cij_marker has been renamed as cij_total
+    by = c("chi", "cij_total")
+  ]
+  individual_file_cols6 <- individual_file_cols6[,
+    .(
+      preventable_beddays = sum(preventable_beddays, na.rm = TRUE)
+    ),
+    by = "chi"
+  ]
+
+  individual_file <- dplyr::bind_cols(
+    individual_file_cols1,
+    individual_file_cols2[, chi := NULL],
+    individual_file_cols3[, chi := NULL],
+    individual_file_cols4[, chi := NULL],
+    individual_file_cols5[, chi := NULL],
+    individual_file_cols6[, chi := NULL]
+  )
+
+  # convert back to tibble
+  return(dplyr::as_tibble(individual_file))
+}
+
+
+#' select columns ending with some patterns
+#' @describeIn select columns based on patterns
+vars_end_with <- function(data, vars, ignore_case = FALSE) {
+  # Combine the candidate suffixes into a single alternation pattern
+  suffix_regex <- stringr::regex(
+    paste(vars, collapse = "|"),
+    ignore_case = ignore_case
+  )
+  names(data)[stringr::str_ends(names(data), suffix_regex)]
+}
+
+#' select columns starting with some patterns
+#' @describeIn select columns based on patterns
+vars_start_with <- function(data, vars, ignore_case = FALSE) {
+  # Combine the candidate prefixes into a single alternation pattern
+  prefix_regex <- stringr::regex(
+    paste(vars, collapse = "|"),
+    ignore_case = ignore_case
+  )
+  names(data)[stringr::str_starts(names(data), prefix_regex)]
+}
+
+#' select columns contains some characters
+#' @describeIn select columns based on patterns
+vars_contain <- function(data, vars, ignore_case = FALSE) {
+  # Combine the candidate fragments into a single alternation pattern
+  fragment_regex <- stringr::regex(
+    paste(vars, collapse = "|"),
+    ignore_case = ignore_case
+  )
+  names(data)[stringr::str_detect(names(data), fragment_regex)]
+}
+
+#' Aggregate CIS episodes
+#'
+#' @description Aggregate CH variables by CHI and CIS.
+#'
+#' @inheritParams create_individual_file
+aggregate_ch_episodes_zihao <- function(episode_file) {
+  cli::cli_alert_info("Aggregate ch episodes function started at {Sys.time()}")
+
+  # Convert to data.table by reference
+  data.table::setDT(episode_file)
+
+  # Add per-(chi, ch_chi_cis) summaries as columns; no rows are collapsed
+  episode_file <- episode_file[, `:=`(
+    ch_no_cost = max(ch_no_cost), # no na.rm: any NA in a group gives NA
+    ch_ep_start = min(record_keydate1),
+    ch_ep_end = max(ch_ep_end), # assumes ch_ep_end exists upstream - confirm
+    ch_cost_per_day = mean(ch_cost_per_day)
+  ), by = c("chi", "ch_chi_cis")]
+
+  # Convert back to a tibble
+  episode_file <- tibble::as_tibble(episode_file)
+
+  return(episode_file)
+}
diff --git a/R/create_individual_file.R b/R/create_individual_file.R
new file mode 100644
index 000000000..675e2066a
--- /dev/null
+++ b/R/create_individual_file.R
@@ -0,0 +1,857 @@
+#' Create individual file
+#'
+#' @description Creates individual file from episode file
+#'
+#' @param episode_file Tibble containing episodic data
+#' @param anon_chi_in (Default:TRUE) Is `anon_chi` used in the input
+#' (instead of chi)
+#' @inheritParams run_episode_file
+#'
+#' @return The processed individual file
+#' @export
+create_individual_file <- function(
+    episode_file,
+    year,
+    write_to_disk = TRUE,
+    anon_chi_in = TRUE,
+    anon_chi_out = TRUE) {
+  if (anon_chi_in) {
+    episode_file <- slfhelper::get_chi(
+      episode_file,
+      anon_chi_var = "anon_chi",
+      drop = TRUE
+    ) %>%
+      dplyr::mutate(chi = dplyr::na_if(.data$chi, ""))
+  }
+
+  individual_file <- episode_file %>%
+    dplyr::select(
+      "year",
+      "chi",
+      "dob",
+      "gender",
+      "record_keydate1",
+      "record_keydate2",
+      "keytime1",
+      "keytime2",
+      "recid",
+      "smrtype",
+      "ipdc",
+      "postcode",
+      "gpprac",
+      "cij_marker",
+      "cij_start_date",
+      "cij_end_date",
+      "cij_pattype",
+      "cij_pattype_code",
+      "cij_ppa",
+      "ch_chi_cis",
+      "yearstay",
+      "cost_total_net",
+      "cost_total_net_inc_dnas",
+      "attendance_status",
+      "no_paid_items",
+      "total_no_dn_contacts",
+      "primary_delay_reason",
+      "sc_latest_submission",
+      "hc_hours_annual",
+      "hc_reablement",
+      "ooh_case_id"
+    ) %>%
+    remove_blank_chi() %>%
+    add_cij_columns() %>%
+    add_all_columns() %>%
+    aggregate_ch_episodes_zihao() %>%
+    clean_up_ch(year) %>%
+    recode_gender() %>%
+    aggregate_by_chi_zihao() %>%
+    clean_individual_file(year) %>%
+    join_cohort_lookups(year) %>%
+    match_on_ltcs(year) %>%
+    join_deaths_data(year) %>%
+    join_sparra_hhg(year) %>%
+    join_slf_lookup_vars() %>%
+    join_sc_client(year) %>%
+    dplyr::mutate(year = !!year) # inject the argument, not a `year` column
+
+  if (anon_chi_out) {
+    individual_file <- individual_file %>%
+      tidyr::replace_na(list(chi = "")) %>%
+      slfhelper::get_anon_chi() %>%
+      dplyr::mutate(anon_chi = dplyr::na_if(.data$anon_chi, ""))
+  }
+
+  if (write_to_disk) {
+    slf_indiv_path <- get_file_path(
+      get_year_dir(year),
+      stringr::str_glue(
+        "source-individual-file-{year}.parquet"
+      ),
+      check_mode = "write"
+    )
+
+    write_file(individual_file, slf_indiv_path)
+  }
+
+  return(individual_file)
+}
+
+#' Remove blank CHI
+#'
+#' @description Convert blank strings to NA and remove NAs from CHI column
+#'
+#' @inheritParams create_individual_file
+remove_blank_chi <- function(episode_file) {
+  cli::cli_alert_info("Remove blank CHI function started at {Sys.time()}")
+
+  # Blank strings become NA first so one NA filter removes both cases
+  chi_as_na <- dplyr::mutate(episode_file, chi = dplyr::na_if(.data$chi, ""))
+  dplyr::filter(chi_as_na, !is.na(.data$chi))
+}
+
+
+#' Add CIJ-related columns
+#'
+#' @description Add new columns related to CIJ
+#'
+#' @inheritParams create_individual_file
+add_cij_columns <- function(episode_file) {
+  cli::cli_alert_info("Add cij columns function started at {Sys.time()}")
+  # Each new column carries cij_marker on matching rows and NA elsewhere
+  episode_file %>%
+    dplyr::mutate(
+      cij_non_el = dplyr::if_else(
+        .data$cij_pattype_code == 0,
+        .data$cij_marker,
+        NA_real_
+      ),
+      cij_el = dplyr::if_else(
+        .data$cij_pattype_code == 1,
+        .data$cij_marker,
+        NA_real_
+      ),
+      cij_mat = dplyr::if_else(
+        .data$cij_pattype_code == 2,
+        .data$cij_marker,
+        NA_real_
+      ),
+      cij_delay = dplyr::if_else(
+        .data$recid == "DD",
+        .data$cij_marker,
+        NA_real_
+      ),
+      preventable_admissions = dplyr::if_else(
+        .data$cij_ppa == 1,
+        .data$cij_marker,
+        NA_integer_ # NOTE(review): siblings use NA_real_; confirm intended type
+      )
+    )
+}
+
+#' Add all columns
+#'
+#' @description Add new columns based on SMRType and recid which follow a pattern
+#' of prefixed column names created based on some condition.
+#'
+#' @inheritParams create_individual_file
+add_all_columns <- function(episode_file) {
+  cli::cli_alert_info("Add all columns function started at {Sys.time()}")
+
+  episode_file %>%
+    add_acute_columns("Acute", (.data$smrtype == "Acute-DC" | .data$smrtype == "Acute-IP") & .data$cij_pattype != "Maternity") %>%
+    add_mat_columns("Mat", .data$recid == "02B" | .data$cij_pattype == "Maternity") %>%
+    add_mh_columns("MH", .data$recid == "04B" & .data$cij_pattype != "Maternity") %>%
+    add_gls_columns("GLS", .data$smrtype == "GLS-IP") %>%
+    add_op_columns("OP", .data$recid == "00B") %>%
+    add_ae_columns("AE", .data$recid == "AE2") %>%
+    add_pis_columns("PIS", .data$recid == "PIS") %>%
+    add_ooh_columns("OoH", .data$recid == "OoH") %>%
+    add_dn_columns("DN", .data$recid == "DN") %>%
+    add_cmh_columns("CMH", .data$recid == "CMH") %>%
+    add_dd_columns("DD", .data$recid == "DD") %>%
+    add_nsu_columns("NSU", .data$recid == "NSU") %>%
+    add_nrs_columns("NRS", .data$recid == "NRS") %>%
+    add_hl1_columns("HL1", .data$recid == "HL1") %>%
+    add_ch_columns("CH", .data$recid == "CH") %>%
+    add_hc_columns("HC", .data$recid == "HC") %>%
+    add_at_columns("AT", .data$recid == "AT") %>%
+    add_sds_columns("SDS", .data$recid == "SDS") %>%
+    dplyr::mutate(
+      health_net_cost = rowSums(
+        dplyr::pick( # select by name: `.data$` is deprecated in tidyselect
+          "Acute_cost",
+          "Mat_cost",
+          "MH_cost",
+          "GLS_cost",
+          "OP_cost_attend",
+          "AE_cost",
+          "PIS_cost",
+          "OoH_cost"
+        ),
+        na.rm = TRUE # a row with no health costs sums to 0, not NA
+      ),
+      health_net_cost_inc_dnas = .data$health_net_cost + dplyr::if_else(
+        is.na(.data$OP_cost_dnas),
+        0,
+        .data$OP_cost_dnas
+      )
+    )
+}
+
+#' Add Acute columns
+#'
+#' @inheritParams create_individual_file
+#' @param prefix Prefix to add to related columns, e.g. "Acute"
+#' @param condition Condition to create new columns based on
+add_acute_columns <- function(episode_file, prefix, condition) {
+  condition <- substitute(condition) # capture the unevaluated expression
+  episode_file %>%
+    add_standard_cols(prefix, condition, episode = TRUE, cost = TRUE) %>%
+    add_ipdc_cols(prefix, condition)
+}
+
+#' Add Mat columns
+#'
+#' @inheritParams add_acute_columns
+add_mat_columns <- function(episode_file, prefix, condition) {
+  condition <- substitute(condition) # capture the unevaluated expression
+  episode_file %>%
+    add_standard_cols(prefix, condition, episode = TRUE, cost = TRUE) %>%
+    add_ipdc_cols(prefix, condition, elective = FALSE)
+}
+
+#' Add MH columns
+#'
+#' @inheritParams add_acute_columns
+add_mh_columns <- function(episode_file, prefix, condition) {
+  condition <- substitute(condition) # capture the unevaluated expression
+  episode_file %>%
+    add_standard_cols(prefix, condition, episode = TRUE, cost = TRUE) %>%
+    add_ipdc_cols(prefix, condition, ipdc_d = FALSE)
+}
+
+#' Add GLS columns
+#'
+#' @inheritParams add_acute_columns
+add_gls_columns <- function(episode_file, prefix, condition) {
+  condition <- substitute(condition) # capture the unevaluated expression
+  episode_file %>%
+    add_standard_cols(prefix, condition, episode = TRUE, cost = TRUE) %>%
+    add_ipdc_cols(prefix, condition, ipdc_d = FALSE)
+}
+
+#' Add OP columns
+#'
+#' @inheritParams add_acute_columns
+add_op_columns <- function(episode_file, prefix, condition) {
+  condition <- substitute(condition) # capture the unevaluated expression
+  episode_file <- episode_file %>%
+    add_standard_cols(prefix, condition)
+  condition_1 <- substitute(condition & attendance_status == 1) # splices in the captured condition
+  episode_file <- episode_file %>%
+    dplyr::mutate(
+      "{prefix}_newcons_attendances" := dplyr::if_else(eval(condition_1), 1L, NA_integer_),
+      "{prefix}_cost_attend" := dplyr::if_else(eval(condition_1), .data$cost_total_net, NA_real_)
+    )
+  condition_5_8 <- substitute(condition & attendance_status %in% c(5, 8)) # statuses feeding the *_dnas columns
+  episode_file <- episode_file %>%
+    dplyr::mutate(
+      "{prefix}_newcons_dnas" := dplyr::if_else(eval(condition_5_8), 1L, NA_integer_),
+      "{prefix}_cost_dnas" := dplyr::if_else(eval(condition_5_8), .data$cost_total_net_inc_dnas, NA_real_)
+    )
+  return(episode_file)
+}
+
+#' Add AE columns
+#'
+#' @inheritParams add_acute_columns
+add_ae_columns <- function(episode_file, prefix, condition) {
+  condition <- substitute(condition) # unevaluated expression; eval()'d inside mutate() below
+  episode_file %>%
+    add_standard_cols(prefix, condition, cost = TRUE) %>% # adds {prefix}_cost only
+    dplyr::mutate("{prefix}_attendances" := dplyr::if_else(eval(condition), 1L, NA_integer_)) # 1 for matching rows, NA otherwise
+}
+
+#' Add PIS columns
+#'
+#' @inheritParams add_acute_columns
+add_pis_columns <- function(episode_file, prefix, condition) {
+  condition <- substitute(condition) # unevaluated expression; eval()'d inside mutate() below
+  episode_file %>%
+    add_standard_cols(prefix, condition, cost = TRUE) %>% # adds {prefix}_cost only
+    dplyr::mutate("{prefix}_paid_items" := dplyr::if_else(eval(condition), .data$no_paid_items, NA_integer_)) # copies no_paid_items for matching rows
+}
+
+#' Add OoH columns
+#'
+#' @inheritParams add_acute_columns
+add_ooh_columns <- function(episode_file, prefix, condition) {
+  condition <- substitute(condition) # unevaluated expression; eval()'d inside mutate() below
+  episode_file <- episode_file %>%
+    add_standard_cols(prefix, condition, cost = TRUE) %>% # adds {prefix}_cost only
+    dplyr::mutate( # one flag column per OOH smrtype: 1 for a matching row, NA otherwise
+      "{prefix}_homeV" := dplyr::if_else(eval(condition) & .data$smrtype == "OOH-HomeV", 1L, NA_integer_),
+      "{prefix}_advice" := dplyr::if_else(eval(condition) & .data$smrtype == "OOH-Advice", 1L, NA_integer_),
+      "{prefix}_DN" := dplyr::if_else(eval(condition) & .data$smrtype == "OOH-DN", 1L, NA_integer_),
+      "{prefix}_NHS24" := dplyr::if_else(eval(condition) & .data$smrtype == "OOH-NHS24", 1L, NA_integer_),
+      "{prefix}_other" := dplyr::if_else(eval(condition) & .data$smrtype == "OOH-Other", 1L, NA_integer_),
+      "{prefix}_PCC" := dplyr::if_else(eval(condition) & .data$smrtype == "OOH-PCC", 1L, NA_integer_),
+      "{prefix}_covid_advice" := dplyr::if_else(eval(condition) & .data$smrtype == "OOH-C19Adv", 1L, NA_integer_),
+      "{prefix}_covid_assessment" := dplyr::if_else(eval(condition) & .data$smrtype == "OOH-C19Ass", 1L, NA_integer_),
+      "{prefix}_covid_other" := dplyr::if_else(eval(condition) & .data$smrtype == "OOH-C19Oth", 1L, NA_integer_) # letter O, matching the C19Adv/C19Ass pattern
+    )
+
+  episode_file <- episode_file %>%
+    dplyr::mutate(
+      "{prefix}_consultation_time" := dplyr::if_else(
+        eval(condition),
+        pmax( # negative durations are floored at 0
+          0,
+          as.numeric((lubridate::seconds_to_period(.data$keytime2) + .data$record_keydate2) - (lubridate::seconds_to_period(.data$keytime1) + .data$record_keydate1), units = "mins")
+        ),
+        NA_real_
+      ),
+    )
+
+  return(episode_file)
+}
+
+#' Add DN columns
+#'
+#' @inheritParams add_acute_columns
+add_dn_columns <- function(episode_file, prefix, condition) {
+  condition <- substitute(condition) # unevaluated expression; eval()'d inside mutate() below
+  episode_file %>%
+    add_standard_cols(prefix, condition, episode = TRUE, cost = TRUE) %>% # adds {prefix}_episodes and {prefix}_cost
+    dplyr::mutate("{prefix}_contacts" := dplyr::if_else(eval(condition), .data$total_no_dn_contacts, NA_integer_)) # copies total_no_dn_contacts for matching rows
+}
+
+#' Add CMH columns
+#'
+#' @inheritParams add_acute_columns
+add_cmh_columns <- function(episode_file, prefix, condition) {
+  condition <- substitute(condition) # unevaluated expression; eval()'d inside mutate() below
+  episode_file %>%
+    add_standard_cols(prefix, condition) %>% # episode/cost are left at FALSE, so this call adds no columns
+    dplyr::mutate("{prefix}_contacts" := dplyr::if_else(eval(condition), 1L, NA_integer_)) # 1 for matching rows, NA otherwise
+}
+
+#' Add DD columns
+#'
+#' @inheritParams add_acute_columns
+add_dd_columns <- function(episode_file, prefix, condition) {
+  condition <- substitute(condition) # unevaluated expression; spliced into the two conditions built below
+  condition_delay <- substitute(condition & primary_delay_reason != "9") # matching rows whose delay reason is not "9"
+  episode_file <- episode_file %>%
+    dplyr::mutate(
+      "{prefix}_NonCode9_episodes" := dplyr::if_else(eval(condition_delay), 1L, NA_integer_),
+      "{prefix}_NonCode9_beddays" := dplyr::if_else(eval(condition_delay), .data$yearstay, NA_real_)
+    )
+  condition_delay_9 <- substitute(condition & primary_delay_reason == "9") # matching rows whose delay reason is "9"
+  episode_file <- episode_file %>%
+    dplyr::mutate(
+      "{prefix}_Code9_episodes" := dplyr::if_else(eval(condition_delay_9), 1L, NA_integer_),
+      "{prefix}_Code9_beddays" := dplyr::if_else(eval(condition_delay_9), .data$yearstay, NA_real_)
+    )
+  return(episode_file)
+}
+
+#' Add NSU columns
+#'
+#' @inheritParams add_acute_columns
+add_nsu_columns <- function(episode_file, prefix, condition) {
+  condition <- substitute(condition) # unevaluated expression; eval()'d inside mutate() below
+  episode_file %>%
+    add_standard_cols(prefix, condition) %>% # episode/cost are left at FALSE, so this call adds no columns
+    dplyr::mutate("{prefix}" := dplyr::if_else(eval(condition), 1L, NA_integer_)) # the new column is named exactly `prefix`
+}
+
+#' Add NRS columns
+#'
+#' @inheritParams add_acute_columns
+add_nrs_columns <- function(episode_file, prefix, condition) {
+  condition <- substitute(condition) # unevaluated expression; eval()'d inside mutate() below
+  episode_file %>%
+    add_standard_cols(prefix, condition) %>% # episode/cost are left at FALSE, so this call adds no columns
+    dplyr::mutate("{prefix}" := dplyr::if_else(eval(condition), 1L, NA_integer_)) # the new column is named exactly `prefix`
+}
+
+#' Add HL1 columns
+#'
+#' @inheritParams add_acute_columns
+add_hl1_columns <- function(episode_file, prefix, condition) {
+  condition <- substitute(condition) # captured for the helper call; nothing else uses it here
+  episode_file %>%
+    add_standard_cols(prefix, condition) # episode/cost are left at FALSE, so episode_file comes back without new columns
+}
+
+#' Add CH columns
+#'
+#' @inheritParams add_acute_columns
+add_ch_columns <- function(episode_file, prefix, condition) {
+  condition <- substitute(condition) # unevaluated expression; eval()'d inside mutate() below
+  episode_file %>%
+    add_standard_cols(prefix, condition) %>% # episode/cost are left at FALSE, so this call adds no columns
+    dplyr::mutate( # NOTE: the ch_* column names below are fixed; `prefix` is not used for them
+      ch_cost_per_day = dplyr::if_else(
+        eval(condition) & .data$yearstay > 0,
+        .data$cost_total_net / .data$yearstay,
+        .data$cost_total_net # taken whenever the test is FALSE (non-matching row, or yearstay not above 0)
+      ),
+      ch_no_cost = eval(condition) & is.na(.data$ch_cost_per_day), # logical: matching row with a missing cost per day
+      ch_ep_end = dplyr::if_else(
+        eval(condition),
+        .data$record_keydate2,
+        lubridate::NA_Date_
+      ),
+      # If end date is missing use the first day of next FY quarter
+      ch_ep_end = dplyr::if_else(
+        eval(condition) & is.na(.data$ch_ep_end),
+        start_next_fy_quarter(.data$sc_latest_submission),
+        .data$ch_ep_end
+      )
+    )
+}
+
+#' Add HC columns
+#'
+#' @inheritParams add_acute_columns
+add_hc_columns <- function(episode_file, prefix, condition) {
+  condition <- substitute(condition) # unevaluated expression; eval()'d inside mutate() below
+  episode_file <- episode_file %>%
+    add_standard_cols(prefix, condition, episode = TRUE) %>% # adds {prefix}_episodes only
+    dplyr::mutate(
+      "{prefix}_total_hours" := dplyr::if_else(eval(condition), .data$hc_hours_annual, NA_real_),
+      "{prefix}_total_cost" := dplyr::if_else(eval(condition), .data$cost_total_net, NA_real_),
+    )
+  condition_per <- substitute(condition & smrtype == "HC-Per") # the captured expression is spliced in for `condition`
+  episode_file <- episode_file %>%
+    dplyr::mutate(
+      "{prefix}_personal_episodes" := dplyr::if_else(eval(condition_per), 1L, NA_integer_),
+      "{prefix}_personal_hours" := dplyr::if_else(eval(condition_per), .data$hc_hours_annual, NA_real_), # source column, so it works for any prefix
+      "{prefix}_personal_hours_cost" := dplyr::if_else(eval(condition_per), .data$cost_total_net, NA_real_)
+    )
+  condition_non_per <- substitute(condition & smrtype == "HC-Non-Per")
+  episode_file <- episode_file %>%
+    dplyr::mutate(
+      "{prefix}_non_personal_episodes" := dplyr::if_else(eval(condition_non_per), 1L, NA_integer_),
+      "{prefix}_non_personal_hours" := dplyr::if_else(eval(condition_non_per), .data$hc_hours_annual, NA_real_),
+      "{prefix}_non_personal_hours_cost" := dplyr::if_else(eval(condition_non_per), .data$cost_total_net, NA_real_)
+    )
+  condition_reabl <- substitute(condition & hc_reablement == 1)
+  episode_file %>% # last expression is the pipeline itself, so the result is returned visibly
+    dplyr::mutate(
+      "{prefix}_reablement_episodes" := dplyr::if_else(eval(condition_reabl), 1L, NA_integer_),
+      "{prefix}_reablement_hours" := dplyr::if_else(eval(condition_reabl), .data$hc_hours_annual, NA_real_),
+      "{prefix}_reablement_hours_cost" := dplyr::if_else(eval(condition_reabl), .data$cost_total_net, NA_real_)
+    )
+}
+
+#' Add AT columns
+#'
+#' @inheritParams add_acute_columns
+add_at_columns <- function(episode_file, prefix, condition) {
+  condition <- substitute(condition) # unevaluated expression; eval()'d inside mutate() below
+  episode_file %>%
+    add_standard_cols(prefix, condition) %>% # episode/cost are left at FALSE, so this call adds no columns
+    dplyr::mutate( # one flag per AT smrtype: 1 for a matching row, NA otherwise
+      "{prefix}_alarms" := dplyr::if_else(eval(condition) & .data$smrtype == "AT-Alarm", 1L, NA_integer_),
+      "{prefix}_telecare" := dplyr::if_else(eval(condition) & .data$smrtype == "AT-Tele", 1L, NA_integer_)
+    )
+}
+
+#' Add SDS columns
+#'
+#' @inheritParams add_acute_columns
+add_sds_columns <- function(episode_file, prefix, condition) {
+  condition <- substitute(condition) # unevaluated expression; eval()'d inside mutate() below
+  episode_file %>%
+    add_standard_cols(prefix, condition) %>% # episode/cost are left at FALSE, so this call adds no columns
+    dplyr::mutate( # one flag per SDS smrtype ("SDS-1" to "SDS-4"): 1 for a matching row, NA otherwise
+      "{prefix}_option_1" := dplyr::if_else(eval(condition) & .data$smrtype == "SDS-1", 1L, NA_integer_),
+      "{prefix}_option_2" := dplyr::if_else(eval(condition) & .data$smrtype == "SDS-2", 1L, NA_integer_),
+      "{prefix}_option_3" := dplyr::if_else(eval(condition) & .data$smrtype == "SDS-3", 1L, NA_integer_),
+      "{prefix}_option_4" := dplyr::if_else(eval(condition) & .data$smrtype == "SDS-4", 1L, NA_integer_)
+    )
+}
+
+#' Add columns based on IPDC
+#'
+#' @description Add columns based on value in IPDC column, which can
+#' be further split by Elective/Non-Elective CIJ.
+#'
+#' @inheritParams add_acute_columns
+#' @param ipdc_d Whether to create columns based on IPDC = "D" (lgl)
+#' @param elective Whether to create columns based on Elective/Non-Elective cij_pattype (lgl)
+add_ipdc_cols <- function(episode_file, prefix, condition, ipdc_d = TRUE, elective = TRUE) {
+  condition_i <- substitute(eval(condition) & ipdc == "I") # `condition` is replaced by the expression the caller passed for it
+  episode_file <- episode_file %>%
+    dplyr::mutate( # inpatient columns: rows matching the condition with ipdc == "I"
+      "{prefix}_inpatient_cost" := dplyr::if_else(eval(condition_i), .data$cost_total_net, NA_real_),
+      "{prefix}_inpatient_episodes" := dplyr::if_else(eval(condition_i), 1L, NA_integer_),
+      "{prefix}_inpatient_beddays" := dplyr::if_else(eval(condition_i), .data$yearstay, NA_real_)
+    )
+  if (elective) { # optionally split the inpatient columns by cij_pattype
+    condition_el <- substitute(condition_i & cij_pattype == "Elective") # condition_i is a local, so its expression is spliced in
+    episode_file <- episode_file %>%
+      dplyr::mutate(
+        "{prefix}_el_inpatient_episodes" := dplyr::if_else(eval(condition_el), 1L, NA_integer_),
+        "{prefix}_el_inpatient_beddays" := dplyr::if_else(eval(condition_el), .data$yearstay, NA_real_),
+        "{prefix}_el_inpatient_cost" := dplyr::if_else(eval(condition_el), .data$cost_total_net, NA_real_)
+      )
+    condition_non_el <- substitute(condition_i & cij_pattype == "Non-Elective")
+    episode_file <- episode_file %>%
+      dplyr::mutate(
+        "{prefix}_non_el_inpatient_episodes" := dplyr::if_else(eval(condition_non_el), 1L, NA_integer_),
+        "{prefix}_non_el_inpatient_beddays" := dplyr::if_else(eval(condition_non_el), .data$yearstay, NA_real_),
+        "{prefix}_non_el_inpatient_cost" := dplyr::if_else(eval(condition_non_el), .data$cost_total_net, NA_real_)
+      )
+  }
+  if (ipdc_d) { # optional daycase columns (episodes and cost only; no beddays column is created)
+    condition_d <- substitute(eval(condition) & ipdc == "D")
+    episode_file <- episode_file %>%
+      dplyr::mutate(
+        "{prefix}_daycase_episodes" := dplyr::if_else(eval(condition_d), 1L, NA_integer_),
+        "{prefix}_daycase_cost" := dplyr::if_else(eval(condition_d), .data$cost_total_net, NA_real_)
+      )
+  }
+  return(episode_file)
+}
+
+#' Add standard columns
+#'
+#' @description Add standard columns (DoB, postcode, gpprac, episodes, cost) to episode file.
+#'
+#' @inheritParams add_acute_columns
+#' @param episode Whether to create prefix_episodes col, e.g. "Acute_episodes"
+#' @param cost Whether to create prefix_cost col, e.g. "Acute_cost"
+add_standard_cols <- function(episode_file, prefix, condition, episode = FALSE, cost = FALSE) { # NOTE(review): only the episodes/cost columns are created here, not DoB/postcode/gpprac
+  if (episode) { # {prefix}_episodes: 1 for rows matching the condition, NA otherwise
+    episode_file <- dplyr::mutate(episode_file, "{prefix}_episodes" := dplyr::if_else(eval(condition), 1L, NA_integer_))
+  }
+  if (cost) { # {prefix}_cost: cost_total_net for rows matching the condition, NA otherwise
+    episode_file <- dplyr::mutate(episode_file, "{prefix}_cost" := dplyr::if_else(eval(condition), .data$cost_total_net, NA_real_))
+  }
+  return(episode_file) # unchanged when both flags are FALSE
+}
+
+
+#' Aggregate CIS episodes
+#'
+#' @description Aggregate CH variables by CHI and CIS.
+#'
+#' @inheritParams create_individual_file
+aggregate_ch_episodes <- function(episode_file) {
+  cli::cli_alert_info("Aggregate ch episodes function started at {Sys.time()}")
+
+  # Convert to a data.table first, then group by CHI and the CH CIS marker
+  # so every summary below is written onto each row of its group.
+  grouped_episodes <- dplyr::group_by(
+    data.table::as.data.table(episode_file),
+    .data$chi,
+    .data$ch_chi_cis
+  )
+
+  with_ch_summaries <- dplyr::mutate(
+    grouped_episodes,
+    ch_no_cost = max(.data$ch_no_cost),
+    ch_ep_start = min(.data$record_keydate1),
+    ch_ep_end = max(.data$ch_ep_end),
+    ch_cost_per_day = mean(.data$ch_cost_per_day)
+  )
+
+  # Drop the grouping and hand back a tibble rather than a data.table
+  tibble::as_tibble(dplyr::ungroup(with_ch_summaries))
+}
+
+#' Clean up CH
+#'
+#' @description Clean up CH-related columns.
+#'
+#' @inheritParams create_individual_file
+clean_up_ch <- function(episode_file, year) {
+  cli::cli_alert_info("Clean up CH function started at {Sys.time()}")
+
+  # Step 1: financial-year bounds plus the clipped episode window.
+  # term_1 is the earlier of ch_ep_end and fy_end + 1;
+  # term_2 is the later of ch_ep_start and fy_start.
+  with_window <- dplyr::mutate(
+    episode_file,
+    fy_end = end_fy(year),
+    fy_start = start_fy(year),
+    term_1 = pmin(.data$ch_ep_end, .data$fy_end + 1),
+    term_2 = pmax(.data$ch_ep_start, .data$fy_start)
+  )
+
+  # Step 2: beddays and cost for CH rows only (NA elsewhere); cost also
+  # needs ch_no_cost == 0. CH rows with ch_chi_cis == 0 are then zeroed.
+  with_totals <- dplyr::mutate(
+    with_window,
+    ch_beddays = dplyr::if_else(
+      .data$recid == "CH", as.numeric(.data$term_1 - .data$term_2), NA_real_
+    ),
+    ch_cost = dplyr::if_else(
+      .data$recid == "CH" & .data$ch_no_cost == 0,
+      .data$ch_beddays * .data$ch_cost_per_day,
+      NA_real_
+    ),
+    ch_beddays = dplyr::if_else(
+      .data$recid == "CH" & .data$ch_chi_cis == 0, 0, .data$ch_beddays
+    ),
+    ch_cost = dplyr::if_else(
+      .data$recid == "CH" & .data$ch_chi_cis == 0, 0, .data$ch_cost
+    )
+  )
+
+  dplyr::select(with_totals, -c("fy_end", "fy_start", "term_1", "term_2"))
+}
+
+#' Recode gender
+#'
+#' @description Recode gender to 1.5 if 0 or 9.
+#'
+#' @inheritParams create_individual_file
+recode_gender <- function(episode_file) {
+  cli::cli_alert_info("Recode Gender function started at {Sys.time()}")
+
+  # Gender codes 0 and 9 become 1.5; every other value is left untouched
+  dplyr::mutate(
+    episode_file,
+    gender = dplyr::if_else(
+      condition = .data$gender %in% c(0, 9),
+      true = 1.5, false = .data$gender
+    )
+  )
+}
+
+#' Aggregate by CHI
+#'
+#' @description Aggregate episode file by CHI to convert into
+#' individual file.
+#'
+#' @inheritParams create_individual_file
+aggregate_by_chi <- function(episode_file) {
+  cli::cli_alert_info("Aggregate by CHI function started at {Sys.time()}")
+
+  episode_file %>%
+    dplyr::arrange( # row order matters: first()/last() below pick values in this order
+      .data$chi,
+      .data$record_keydate1,
+      .data$keytime1,
+      .data$record_keydate2,
+      .data$keytime2
+    ) %>%
+    dplyr::group_by(.data$chi) %>%
+    dplyr::summarise(
+      gender = mean(.data$gender),
+      dplyr::across(
+        dplyr::ends_with(c("postcode", "DoB", "gpprac")),
+        ~ dplyr::last(., na_rm = TRUE) # last non-missing value per CHI
+      ),
+      dplyr::across(
+        c(
+          "ch_cis_episodes" = "ch_chi_cis",
+          "cij_total" = "cij_marker",
+          "cij_el",
+          "cij_non_el",
+          "cij_mat",
+          # "cij_delay",
+          "ooh_cases" = "ooh_case_id",
+          "preventable_admissions"
+        ),
+        ~ dplyr::n_distinct(.x, na.rm = TRUE) # number of distinct non-missing values per CHI
+      ),
+      dplyr::across(
+        c(
+          dplyr::ends_with(
+            c(
+              "episodes",
+              "beddays",
+              "cost",
+              "attendances",
+              "attend",
+              "contacts",
+              "hours",
+              "alarms",
+              "telecare",
+              "paid_items",
+              "advice",
+              "homeV",
+              "time",
+              "assessment",
+              "other",
+              # "DN",
+              "NHS24",
+              "PCC",
+              "_dnas"
+            )
+          ),
+          dplyr::starts_with("SDS_option")
+        ),
+        ~ sum(., na.rm = TRUE) # totals per CHI; missing values are ignored
+      ),
+      # dplyr::across(
+      #   c(
+      #     # dplyr::starts_with("sc_"),
+      #     #-"sc_send_lca",
+      #     #-"sc_latest_submission",
+      #     # "HL1_in_FY" = "hh_in_fy",
+      #     "NSU"
+      #   ),
+      #   ~ max_no_inf(.)
+      # ),
+      dplyr::across(
+        c(
+          condition_cols(),
+          # "death_date",
+          # "deceased",
+          "year",
+          dplyr::ends_with(c(
+            "_Cohort", "end_fy", "start_fy"
+          )),
+        ),
+        ~ dplyr::first(., na_rm = TRUE) # first non-missing value per CHI
+      )
+    ) %>%
+    dplyr::ungroup()
+}
+
+#' Condition columns
+#'
+#' @description Returns chr vector of column names
+#' which follow format "condition" and "condition_date" e.g.
+#' "dementia" and "dementia_date"
+condition_cols <- function() {
+  # The condition names come from slfhelper; the result lists all of them
+  # first, followed by the same names with a "_date" suffix.
+  condition_names <- slfhelper::ltc_vars
+  c(condition_names, paste0(condition_names, "_date"))
+}
+
+#' Custom maximum
+#'
+#' @description Custom maximum function which removes
+#' missing values but doesn't return Inf if all values
+#' are missing (instead returns NA)
+#'
+#' @param x Vector to return max of
+max_no_inf <- function(x) { # scalar if/else: max() only runs when a non-missing value exists, so no "-Inf" warning
+  if (all(is.na(x))) unname(x[NA_integer_]) else max(x, na.rm = TRUE) # x[NA_integer_] is a single NA of x's own type
+}
+
+#' Custom minimum
+#'
+#' @description Custom minimum function which removes
+#' missing values but doesn't return Inf if all values
+#' are missing (instead returns NA)
+#'
+#' @param x Vector to return min of
+min_no_inf <- function(x) { # scalar if/else: min() only runs when a non-missing value exists, so no "Inf" warning
+  if (all(is.na(x))) unname(x[NA_integer_]) else min(x, na.rm = TRUE) # x[NA_integer_] is a single NA of x's own type
+}
+
+#' Clean individual file
+#'
+#' @description Clean up columns in individual file
+#'
+#' @param individual_file Individual file where each row represents a unique CHI
+#' @param year Financial year e.g 1718
+clean_individual_file <- function(individual_file, year) {
+  cli::cli_alert_info("Clean individual file function started at {Sys.time()}")
+
+  # Columns that are not kept in the individual file
+  trimmed_file <- dplyr::select(
+    individual_file,
+    !c("ch_no_cost", "no_paid_items", "total_no_dn_contacts", "cost_total_net_inc_dnas")
+  )
+
+  # Tidy gender first, then add age via compute_mid_year_age(year, dob)
+  dplyr::mutate(
+    clean_up_gender(trimmed_file),
+    age = compute_mid_year_age(year, .data$dob)
+  )
+}
+
+#' Clean up gender column
+#'
+#' @description Clean up column containing gender.
+#'
+#' @inheritParams clean_individual_file
+clean_up_gender <- function(individual_file) {
+  dplyr::mutate(
+    individual_file,
+    gender = dplyr::case_when(
+      .data$gender != 1.5 ~ round(.data$gender), # a missing gender does not match here and falls to .default
+      .default = phsmethods::sex_from_chi(.data$chi, chi_check = FALSE)
+    )
+  )
+}
+
+#' Join slf lookup variables
+#'
+#' @description Join lookup variables from slf postcode lookup and slf gpprac
+#'              lookup.
+#'
+#' @param individual_file the processed individual file.
+#' @param slf_postcode_lookup SLF processed postcode lookup
+#' @param slf_gpprac_lookup SLF processed gpprac lookup
+#' @param hbrescode_var hbrescode variable
+#'
+join_slf_lookup_vars <- function(individual_file,
+                                 slf_postcode_lookup = read_file(get_slf_postcode_path()),
+                                 slf_gpprac_lookup = read_file(
+                                   get_slf_gpprac_path(),
+                                   col_select = c("gpprac", "cluster", "hbpraccode")
+                                 ),
+                                 hbrescode_var = "hb2018") {
+  individual_file <- individual_file %>%
+    dplyr::left_join( # postcode-level lookup variables
+      slf_postcode_lookup,
+      by = "postcode"
+    ) %>%
+    dplyr::left_join( # practice-level lookup variables
+      slf_gpprac_lookup,
+      by = "gpprac"
+    ) %>%
+    dplyr::rename(hbrescode = dplyr::all_of(hbrescode_var)) # all_of(): treat the string as a column name, never as a column called `hbrescode_var`
+
+  return(individual_file)
+}
+# TODO Remove the client data from the individual Social Care extracts
+# and instead, use this function in the episode file to match on the client
+# data to all episodes.
+#' Join sc client variables onto individual file
+#'
+#' @description Match on sc client variables.
+#'
+#' @param individual_file the processed individual file
+#' @param year financial year.
+#' @param sc_client SC client lookup
+#' @param sc_demographics SC Demographic lookup
+join_sc_client <- function(individual_file,
+                           year,
+                           sc_client = read_file(get_source_extract_path(year, "Client")),
+                           sc_demographics = read_file(get_sc_demog_lookup_path(),
+                             col_select = c("sending_location", "social_care_id", "chi")
+                           )) {
+  # TODO Update the client lookup processing script to match
+  # on demographics there so the client lookup already has CHI.
+
+  # Attach CHI to each client record; only the two join keys and chi
+  # are taken from the demographics lookup.
+  client_with_chi <- dplyr::left_join(
+    sc_client,
+    dplyr::select(sc_demographics, "sending_location", "social_care_id", "chi"),
+    by = c("sending_location", "social_care_id")
+  )
+
+  # Bring the client variables onto the individual file by CHI
+  joined_file <- dplyr::left_join(individual_file, client_with_chi, by = "chi")
+
+  # The two social care ID columns and sc_latest_submission are dropped
+  dplyr::select(
+    joined_file,
+    !c("sending_location", "social_care_id", "sc_latest_submission")
+  )
+}
diff --git a/R/process_tests_individual_file.R b/R/process_tests_individual_file.R
index 2eb3503e2..695dc19a0 100644
--- a/R/process_tests_individual_file.R
+++ b/R/process_tests_individual_file.R
@@ -10,12 +10,12 @@ process_tests_individual_file <- function(data, year) {
   data <- data %>%
     dplyr::select(
       "year",
-      "chi",
+      "anon_chi",
       "gender",
-      # "postcode", # Add back in once postcode is fixed
+      "postcode",
       "dob",
-      # "hbrescode", #add back in when available
-      # "health_net_cost",
+      "hbrescode",
+      "health_net_cost",
       slfhelper::ltc_vars,
       dplyr::contains(c(
         "beddays",
@@ -26,7 +26,8 @@ process_tests_individual_file <- function(data, year) {
         "cases",
         "consultations"
       ))
-    )
+    ) %>%
+    slfhelper::get_chi()
 
   old_data <- get_existing_data_for_tests(data, file_version = "individual")
 
@@ -61,8 +62,8 @@ produce_individual_file_tests <- function(data) {
   test_flags <- data %>%
     # use functions to create HB and partnership flags
     create_demog_test_flags() %>%
-    # create_hb_test_flags(.data$hbrescode) %>%
-    # create_hb_cost_test_flags(.data$hbrescode, .data$health_net_cost) %>%
+    create_hb_test_flags(.data$hbrescode) %>%
+    create_hb_cost_test_flags(.data$hbrescode, .data$health_net_cost) %>%
     # keep variables for comparison
     dplyr::select(c("valid_chi":dplyr::last_col())) %>%
     # use function to sum new test flags
@@ -82,13 +83,13 @@ produce_individual_file_tests <- function(data) {
       measure = "all"
     )
 
-  # min_max_measures <- data %>%
-  #   calculate_measures(
-  #     vars = c(
-  #       "health_net_cost",
-  #     ),
-  #     measure = "min-max"
-  #   )
+  min_max_measures <- data %>%
+    calculate_measures(
+      vars = c(
+        "health_net_cost"
+      ),
+      measure = "min-max"
+    )
 
   sum_measures <- data %>%
     dplyr::select(slfhelper::ltc_vars) %>%
@@ -102,7 +103,7 @@ produce_individual_file_tests <- function(data) {
   join_output <- list(
     test_flags,
     all_measures,
-    # min_max_measures,
+    min_max_measures,
     sum_measures
   ) %>%
     purrr::reduce(dplyr::full_join, by = c("measure", "value"))
diff --git a/_targets.R b/_targets.R
index 58e5f573f..ef2fbbe74 100644
--- a/_targets.R
+++ b/_targets.R
@@ -549,6 +549,20 @@ list(
       )
     ),
     tar_target(
+      individual_file,
+      create_individual_file(
+        episode_file = episode_file,
+        year = year,
+        write_to_disk = write_to_disk
+      )
+    ),
+    tar_target(
+      individual_file_tests,
+      process_tests_individual_file(
+        data = individual_file, year = year
+      )
+    ),
+    tar_target(
       episode_file_dataset,
       arrow::write_dataset(
         dataset = episode_file,
diff --git a/man/add_acute_columns.Rd b/man/add_acute_columns.Rd
new file mode 100644
index 000000000..52ba071b6
--- /dev/null
+++ b/man/add_acute_columns.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{add_acute_columns}
+\alias{add_acute_columns}
+\title{Add Acute columns}
+\usage{
+add_acute_columns(episode_file, prefix, condition)
+}
+\arguments{
+\item{episode_file}{Tibble containing episodic data}
+
+\item{prefix}{Prefix to add to related columns, e.g. "Acute"}
+
+\item{condition}{Condition to create new columns based on}
+}
+\description{
+Add Acute columns
+}
diff --git a/man/add_ae_columns.Rd b/man/add_ae_columns.Rd
new file mode 100644
index 000000000..9b7099513
--- /dev/null
+++ b/man/add_ae_columns.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{add_ae_columns}
+\alias{add_ae_columns}
+\title{Add AE columns}
+\usage{
+add_ae_columns(episode_file, prefix, condition)
+}
+\arguments{
+\item{episode_file}{Tibble containing episodic data}
+
+\item{prefix}{Prefix to add to related columns, e.g. "Acute"}
+
+\item{condition}{Condition to create new columns based on}
+}
+\description{
+Add AE columns
+}
diff --git a/man/add_all_columns.Rd b/man/add_all_columns.Rd
new file mode 100644
index 000000000..d502e95c3
--- /dev/null
+++ b/man/add_all_columns.Rd
@@ -0,0 +1,15 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{add_all_columns}
+\alias{add_all_columns}
+\title{Add all columns}
+\usage{
+add_all_columns(episode_file)
+}
+\arguments{
+\item{episode_file}{Tibble containing episodic data}
+}
+\description{
+Add new columns based on SMRType and recid which follow a pattern
+of prefixed column names created based on some condition.
+}
diff --git a/man/add_at_columns.Rd b/man/add_at_columns.Rd
new file mode 100644
index 000000000..e05ea9101
--- /dev/null
+++ b/man/add_at_columns.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{add_at_columns}
+\alias{add_at_columns}
+\title{Add AT columns}
+\usage{
+add_at_columns(episode_file, prefix, condition)
+}
+\arguments{
+\item{episode_file}{Tibble containing episodic data}
+
+\item{prefix}{Prefix to add to related columns, e.g. "Acute"}
+
+\item{condition}{Condition to create new columns based on}
+}
+\description{
+Add AT columns
+}
diff --git a/man/add_ch_columns.Rd b/man/add_ch_columns.Rd
new file mode 100644
index 000000000..4938f7690
--- /dev/null
+++ b/man/add_ch_columns.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{add_ch_columns}
+\alias{add_ch_columns}
+\title{Add CH columns}
+\usage{
+add_ch_columns(episode_file, prefix, condition)
+}
+\arguments{
+\item{episode_file}{Tibble containing episodic data}
+
+\item{prefix}{Prefix to add to related columns, e.g. "Acute"}
+
+\item{condition}{Condition to create new columns based on}
+}
+\description{
+Add CH columns
+}
diff --git a/man/add_cij_columns.Rd b/man/add_cij_columns.Rd
new file mode 100644
index 000000000..7d00e6299
--- /dev/null
+++ b/man/add_cij_columns.Rd
@@ -0,0 +1,14 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{add_cij_columns}
+\alias{add_cij_columns}
+\title{Add CIJ-related columns}
+\usage{
+add_cij_columns(episode_file)
+}
+\arguments{
+\item{episode_file}{Tibble containing episodic data}
+}
+\description{
+Add new columns related to CIJ
+}
diff --git a/man/add_cmh_columns.Rd b/man/add_cmh_columns.Rd
new file mode 100644
index 000000000..a1d82cba6
--- /dev/null
+++ b/man/add_cmh_columns.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{add_cmh_columns}
+\alias{add_cmh_columns}
+\title{Add CMH columns}
+\usage{
+add_cmh_columns(episode_file, prefix, condition)
+}
+\arguments{
+\item{episode_file}{Tibble containing episodic data}
+
+\item{prefix}{Prefix to add to related columns, e.g. "Acute"}
+
+\item{condition}{Condition to create new columns based on}
+}
+\description{
+Add CMH columns
+}
diff --git a/man/add_dd_columns.Rd b/man/add_dd_columns.Rd
new file mode 100644
index 000000000..08d9c0fe4
--- /dev/null
+++ b/man/add_dd_columns.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{add_dd_columns}
+\alias{add_dd_columns}
+\title{Add DD columns}
+\usage{
+add_dd_columns(episode_file, prefix, condition)
+}
+\arguments{
+\item{episode_file}{Tibble containing episodic data}
+
+\item{prefix}{Prefix to add to related columns, e.g. "Acute"}
+
+\item{condition}{Condition to create new columns based on}
+}
+\description{
+Add DD columns
+}
diff --git a/man/add_dn_columns.Rd b/man/add_dn_columns.Rd
new file mode 100644
index 000000000..bf6af008f
--- /dev/null
+++ b/man/add_dn_columns.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{add_dn_columns}
+\alias{add_dn_columns}
+\title{Add DN columns}
+\usage{
+add_dn_columns(episode_file, prefix, condition)
+}
+\arguments{
+\item{episode_file}{Tibble containing episodic data}
+
+\item{prefix}{Prefix to add to related columns, e.g. "Acute"}
+
+\item{condition}{Condition to create new columns based on}
+}
+\description{
+Add DN columns
+}
diff --git a/man/add_gls_columns.Rd b/man/add_gls_columns.Rd
new file mode 100644
index 000000000..e71dc755b
--- /dev/null
+++ b/man/add_gls_columns.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{add_gls_columns}
+\alias{add_gls_columns}
+\title{Add GLS columns}
+\usage{
+add_gls_columns(episode_file, prefix, condition)
+}
+\arguments{
+\item{episode_file}{Tibble containing episodic data}
+
+\item{prefix}{Prefix to add to related columns, e.g. "Acute"}
+
+\item{condition}{Condition to create new columns based on}
+}
+\description{
+Add GLS columns
+}
diff --git a/man/add_hc_columns.Rd b/man/add_hc_columns.Rd
new file mode 100644
index 000000000..95d8f1d3b
--- /dev/null
+++ b/man/add_hc_columns.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{add_hc_columns}
+\alias{add_hc_columns}
+\title{Add HC columns}
+\usage{
+add_hc_columns(episode_file, prefix, condition)
+}
+\arguments{
+\item{episode_file}{Tibble containing episodic data}
+
+\item{prefix}{Prefix to add to related columns, e.g. "Acute"}
+
+\item{condition}{Condition to create new columns based on}
+}
+\description{
+Add HC columns
+}
diff --git a/man/add_hl1_columns.Rd b/man/add_hl1_columns.Rd
new file mode 100644
index 000000000..7600db5e9
--- /dev/null
+++ b/man/add_hl1_columns.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{add_hl1_columns}
+\alias{add_hl1_columns}
+\title{Add HL1 columns}
+\usage{
+add_hl1_columns(episode_file, prefix, condition)
+}
+\arguments{
+\item{episode_file}{Tibble containing episodic data}
+
+\item{prefix}{Prefix to add to related columns, e.g. "Acute"}
+
+\item{condition}{Condition to create new columns based on}
+}
+\description{
+Add HL1 columns
+}
diff --git a/man/add_ipdc_cols.Rd b/man/add_ipdc_cols.Rd
new file mode 100644
index 000000000..0f91cbd90
--- /dev/null
+++ b/man/add_ipdc_cols.Rd
@@ -0,0 +1,23 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{add_ipdc_cols}
+\alias{add_ipdc_cols}
+\title{Add columns based on IPDC}
+\usage{
+add_ipdc_cols(episode_file, prefix, condition, ipdc_d = TRUE, elective = TRUE)
+}
+\arguments{
+\item{episode_file}{Tibble containing episodic data}
+
+\item{prefix}{Prefix to add to related columns, e.g. "Acute"}
+
+\item{condition}{Condition to create new columns based on}
+
+\item{ipdc_d}{Whether to create columns based on IPDC = "D" (lgl)}
+
+\item{elective}{Whether to create columns based on Elective/Non-Elective cij_pattype (lgl)}
+}
+\description{
+Add columns based on value in IPDC column, which can
+be further split by Elective/Non-Elective CIJ.
+}
diff --git a/man/add_mat_columns.Rd b/man/add_mat_columns.Rd
new file mode 100644
index 000000000..aae729323
--- /dev/null
+++ b/man/add_mat_columns.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{add_mat_columns}
+\alias{add_mat_columns}
+\title{Add Mat columns}
+\usage{
+add_mat_columns(episode_file, prefix, condition)
+}
+\arguments{
+\item{episode_file}{Tibble containing episodic data}
+
+\item{prefix}{Prefix to add to related columns, e.g. "Acute"}
+
+\item{condition}{Condition to create new columns based on}
+}
+\description{
+Add Mat columns
+}
diff --git a/man/add_mh_columns.Rd b/man/add_mh_columns.Rd
new file mode 100644
index 000000000..3c50c6cb8
--- /dev/null
+++ b/man/add_mh_columns.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{add_mh_columns}
+\alias{add_mh_columns}
+\title{Add MH columns}
+\usage{
+add_mh_columns(episode_file, prefix, condition)
+}
+\arguments{
+\item{episode_file}{Tibble containing episodic data}
+
+\item{prefix}{Prefix to add to related columns, e.g. "Acute"}
+
+\item{condition}{Condition to create new columns based on}
+}
+\description{
+Add MH columns
+}
diff --git a/man/add_nrs_columns.Rd b/man/add_nrs_columns.Rd
new file mode 100644
index 000000000..9d7b3f8bf
--- /dev/null
+++ b/man/add_nrs_columns.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{add_nrs_columns}
+\alias{add_nrs_columns}
+\title{Add NRS columns}
+\usage{
+add_nrs_columns(episode_file, prefix, condition)
+}
+\arguments{
+\item{episode_file}{Tibble containing episodic data}
+
+\item{prefix}{Prefix to add to related columns, e.g. "Acute"}
+
+\item{condition}{Condition to create new columns based on}
+}
+\description{
+Add NRS columns
+}
diff --git a/man/add_nsu_columns.Rd b/man/add_nsu_columns.Rd
new file mode 100644
index 000000000..6a54bbcbf
--- /dev/null
+++ b/man/add_nsu_columns.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{add_nsu_columns}
+\alias{add_nsu_columns}
+\title{Add NSU columns}
+\usage{
+add_nsu_columns(episode_file, prefix, condition)
+}
+\arguments{
+\item{episode_file}{Tibble containing episodic data}
+
+\item{prefix}{Prefix to add to related columns, e.g. "Acute"}
+
+\item{condition}{Condition to create new columns based on}
+}
+\description{
+Add NSU columns
+}
diff --git a/man/add_ooh_columns.Rd b/man/add_ooh_columns.Rd
new file mode 100644
index 000000000..01814ab6d
--- /dev/null
+++ b/man/add_ooh_columns.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{add_ooh_columns}
+\alias{add_ooh_columns}
+\title{Add OoH columns}
+\usage{
+add_ooh_columns(episode_file, prefix, condition)
+}
+\arguments{
+\item{episode_file}{Tibble containing episodic data}
+
+\item{prefix}{Prefix to add to related columns, e.g. "Acute"}
+
+\item{condition}{Condition to create new columns based on}
+}
+\description{
+Add OoH columns
+}
diff --git a/man/add_op_columns.Rd b/man/add_op_columns.Rd
new file mode 100644
index 000000000..08c4419e2
--- /dev/null
+++ b/man/add_op_columns.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{add_op_columns}
+\alias{add_op_columns}
+\title{Add OP columns}
+\usage{
+add_op_columns(episode_file, prefix, condition)
+}
+\arguments{
+\item{episode_file}{Tibble containing episodic data}
+
+\item{prefix}{Prefix to add to related columns, e.g. "Acute"}
+
+\item{condition}{Condition to create new columns based on}
+}
+\description{
+Add OP columns
+}
diff --git a/man/add_pis_columns.Rd b/man/add_pis_columns.Rd
new file mode 100644
index 000000000..b582acf2e
--- /dev/null
+++ b/man/add_pis_columns.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{add_pis_columns}
+\alias{add_pis_columns}
+\title{Add PIS columns}
+\usage{
+add_pis_columns(episode_file, prefix, condition)
+}
+\arguments{
+\item{episode_file}{Tibble containing episodic data}
+
+\item{prefix}{Prefix to add to related columns, e.g. "Acute"}
+
+\item{condition}{Condition to create new columns based on}
+}
+\description{
+Add PIS columns
+}
diff --git a/man/add_sds_columns.Rd b/man/add_sds_columns.Rd
new file mode 100644
index 000000000..d5a5fb2cf
--- /dev/null
+++ b/man/add_sds_columns.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{add_sds_columns}
+\alias{add_sds_columns}
+\title{Add SDS columns}
+\usage{
+add_sds_columns(episode_file, prefix, condition)
+}
+\arguments{
+\item{episode_file}{Tibble containing episodic data}
+
+\item{prefix}{Prefix to add to related columns, e.g. "Acute"}
+
+\item{condition}{Condition to create new columns based on}
+}
+\description{
+Add SDS columns
+}
diff --git a/man/add_standard_cols.Rd b/man/add_standard_cols.Rd
new file mode 100644
index 000000000..744aa49de
--- /dev/null
+++ b/man/add_standard_cols.Rd
@@ -0,0 +1,28 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{add_standard_cols}
+\alias{add_standard_cols}
+\title{Add standard columns}
+\usage{
+add_standard_cols(
+  episode_file,
+  prefix,
+  condition,
+  episode = FALSE,
+  cost = FALSE
+)
+}
+\arguments{
+\item{episode_file}{Tibble containing episodic data}
+
+\item{prefix}{Prefix to add to related columns, e.g. "Acute"}
+
+\item{condition}{Condition to create new columns based on}
+
+\item{episode}{Whether to create prefix_episodes col, e.g. "Acute_episodes"}
+
+\item{cost}{Whether to create prefix_cost col, e.g. "Acute_cost"}
+}
+\description{
+Add standard columns (DoB, postcode, gpprac, episodes, cost) to episode file.
+}
diff --git a/man/aggregate_by_chi.Rd b/man/aggregate_by_chi.Rd
new file mode 100644
index 000000000..73804ad9b
--- /dev/null
+++ b/man/aggregate_by_chi.Rd
@@ -0,0 +1,15 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{aggregate_by_chi}
+\alias{aggregate_by_chi}
+\title{Aggregate by CHI}
+\usage{
+aggregate_by_chi(episode_file)
+}
+\arguments{
+\item{episode_file}{Tibble containing episodic data}
+}
+\description{
+Aggregate episode file by CHI to convert into
+individual file.
+}
diff --git a/man/aggregate_by_chi_zihao.Rd b/man/aggregate_by_chi_zihao.Rd
new file mode 100644
index 000000000..3d4961e19
--- /dev/null
+++ b/man/aggregate_by_chi_zihao.Rd
@@ -0,0 +1,15 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/aggregate_by_chi_zihao.R
+\name{aggregate_by_chi_zihao}
+\alias{aggregate_by_chi_zihao}
+\title{Aggregate by CHI}
+\usage{
+aggregate_by_chi_zihao(episode_file)
+}
+\arguments{
+\item{episode_file}{Tibble containing episodic data}
+}
+\description{
+Aggregate episode file by CHI to convert into
+individual file.
+}
diff --git a/man/aggregate_ch_episodes.Rd b/man/aggregate_ch_episodes.Rd
new file mode 100644
index 000000000..2753da14f
--- /dev/null
+++ b/man/aggregate_ch_episodes.Rd
@@ -0,0 +1,14 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{aggregate_ch_episodes}
+\alias{aggregate_ch_episodes}
+\title{Aggregate CIS episodes}
+\usage{
+aggregate_ch_episodes(episode_file)
+}
+\arguments{
+\item{episode_file}{Tibble containing episodic data}
+}
+\description{
+Aggregate CH variables by CHI and CIS.
+}
diff --git a/man/aggregate_ch_episodes_zihao.Rd b/man/aggregate_ch_episodes_zihao.Rd
new file mode 100644
index 000000000..808262654
--- /dev/null
+++ b/man/aggregate_ch_episodes_zihao.Rd
@@ -0,0 +1,14 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/aggregate_by_chi_zihao.R
+\name{aggregate_ch_episodes_zihao}
+\alias{aggregate_ch_episodes_zihao}
+\title{Aggregate CIS episodes}
+\usage{
+aggregate_ch_episodes_zihao(episode_file)
+}
+\arguments{
+\item{episode_file}{Tibble containing episodic data}
+}
+\description{
+Aggregate CH variables by CHI and CIS.
+}
diff --git a/man/clean_individual_file.Rd b/man/clean_individual_file.Rd
new file mode 100644
index 000000000..fb2d3ae13
--- /dev/null
+++ b/man/clean_individual_file.Rd
@@ -0,0 +1,16 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{clean_individual_file}
+\alias{clean_individual_file}
+\title{Clean individual file}
+\usage{
+clean_individual_file(individual_file, year)
+}
+\arguments{
+\item{individual_file}{Individual file where each row represents a unique CHI}
+
+\item{year}{Financial year e.g 1718}
+}
+\description{
+Clean up columns in individual file
+}
diff --git a/man/clean_up_ch.Rd b/man/clean_up_ch.Rd
new file mode 100644
index 000000000..0182c84e8
--- /dev/null
+++ b/man/clean_up_ch.Rd
@@ -0,0 +1,16 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{clean_up_ch}
+\alias{clean_up_ch}
+\title{Clean up CH}
+\usage{
+clean_up_ch(episode_file, year)
+}
+\arguments{
+\item{episode_file}{Tibble containing episodic data}
+
+\item{year}{The year to process, in FY format.}
+}
+\description{
+Clean up CH-related columns.
+}
diff --git a/man/clean_up_gender.Rd b/man/clean_up_gender.Rd
new file mode 100644
index 000000000..edf05bfc8
--- /dev/null
+++ b/man/clean_up_gender.Rd
@@ -0,0 +1,14 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{clean_up_gender}
+\alias{clean_up_gender}
+\title{Clean up gender column}
+\usage{
+clean_up_gender(individual_file)
+}
+\arguments{
+\item{individual_file}{Individual file where each row represents a unique CHI}
+}
+\description{
+Clean up column containing gender.
+}
diff --git a/man/condition_cols.Rd b/man/condition_cols.Rd
new file mode 100644
index 000000000..ba037a609
--- /dev/null
+++ b/man/condition_cols.Rd
@@ -0,0 +1,13 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{condition_cols}
+\alias{condition_cols}
+\title{Condition columns}
+\usage{
+condition_cols()
+}
+\description{
+Returns chr vector of column names
+which follow format "condition" and "condition_date" e.g.
+"dementia" and "dementia_date"
+}
diff --git a/man/create_individual_file.Rd b/man/create_individual_file.Rd
new file mode 100644
index 000000000..fa759e7b1
--- /dev/null
+++ b/man/create_individual_file.Rd
@@ -0,0 +1,34 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{create_individual_file}
+\alias{create_individual_file}
+\title{Create individual file}
+\usage{
+create_individual_file(
+  episode_file,
+  year,
+  write_to_disk = TRUE,
+  anon_chi_in = TRUE,
+  anon_chi_out = TRUE
+)
+}
+\arguments{
+\item{episode_file}{Tibble containing episodic data}
+
+\item{year}{The year to process, in FY format.}
+
+\item{write_to_disk}{(optional) Should the data be written to disk default is
+\code{TRUE} i.e. write the data to disk.}
+
+\item{anon_chi_in}{(Default:TRUE) Is \code{anon_chi} used in the input
+(instead of chi)}
+
+\item{anon_chi_out}{(Default:TRUE) Should \code{anon_chi} be used in the output
+(instead of chi)}
+}
+\value{
+The processed individual file
+}
+\description{
+Creates individual file from episode file
+}
diff --git a/man/join_sc_client.Rd b/man/join_sc_client.Rd
new file mode 100644
index 000000000..a30719698
--- /dev/null
+++ b/man/join_sc_client.Rd
@@ -0,0 +1,26 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{join_sc_client}
+\alias{join_sc_client}
+\title{Join sc client variables onto individual file}
+\usage{
+join_sc_client(
+  individual_file,
+  year,
+  sc_client = read_file(get_source_extract_path(year, "Client")),
+  sc_demographics = read_file(get_sc_demog_lookup_path(), col_select =
+    c("sending_location", "social_care_id", "chi"))
+)
+}
+\arguments{
+\item{individual_file}{the processed individual file}
+
+\item{year}{financial year.}
+
+\item{sc_client}{SC client lookup}
+
+\item{sc_demographics}{SC Demographic lookup}
+}
+\description{
+Match on sc client variables.
+}
diff --git a/man/join_slf_lookup_vars.Rd b/man/join_slf_lookup_vars.Rd
new file mode 100644
index 000000000..980c66f31
--- /dev/null
+++ b/man/join_slf_lookup_vars.Rd
@@ -0,0 +1,27 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{join_slf_lookup_vars}
+\alias{join_slf_lookup_vars}
+\title{Join slf lookup variables}
+\usage{
+join_slf_lookup_vars(
+  individual_file,
+  slf_postcode_lookup = read_file(get_slf_postcode_path()),
+  slf_gpprac_lookup = read_file(get_slf_gpprac_path(), col_select = c("gpprac",
+    "cluster", "hbpraccode")),
+  hbrescode_var = "hb2018"
+)
+}
+\arguments{
+\item{individual_file}{the processed individual file.}
+
+\item{slf_postcode_lookup}{SLF processed postcode lookup}
+
+\item{slf_gpprac_lookup}{SLF processed gpprac lookup}
+
+\item{hbrescode_var}{hbrescode variable}
+}
+\description{
+Join lookup variables from slf postcode lookup and slf gpprac
+lookup.
+}
diff --git a/man/max_no_inf.Rd b/man/max_no_inf.Rd
new file mode 100644
index 000000000..79b9a1057
--- /dev/null
+++ b/man/max_no_inf.Rd
@@ -0,0 +1,16 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{max_no_inf}
+\alias{max_no_inf}
+\title{Custom maximum}
+\usage{
+max_no_inf(x)
+}
+\arguments{
+\item{x}{Vector to return max of}
+}
+\description{
+Custom maximum function which removes
+missing values but doesn't return Inf if all values
+are missing (instead returns NA)
+}
diff --git a/man/min_no_inf.Rd b/man/min_no_inf.Rd
new file mode 100644
index 000000000..38029214f
--- /dev/null
+++ b/man/min_no_inf.Rd
@@ -0,0 +1,16 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{min_no_inf}
+\alias{min_no_inf}
+\title{Custom minimum}
+\usage{
+min_no_inf(x)
+}
+\arguments{
+\item{x}{Vector to return min of}
+}
+\description{
+Custom minimum function which removes
+missing values but doesn't return Inf if all values
+are missing (instead returns NA)
+}
diff --git a/man/recode_gender.Rd b/man/recode_gender.Rd
new file mode 100644
index 000000000..526d2829d
--- /dev/null
+++ b/man/recode_gender.Rd
@@ -0,0 +1,14 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{recode_gender}
+\alias{recode_gender}
+\title{Recode gender}
+\usage{
+recode_gender(episode_file)
+}
+\arguments{
+\item{episode_file}{Tibble containing episodic data}
+}
+\description{
+Recode gender to 1.5 if 0 or 9.
+}
diff --git a/man/remove_blank_chi.Rd b/man/remove_blank_chi.Rd
new file mode 100644
index 000000000..9cba40a8f
--- /dev/null
+++ b/man/remove_blank_chi.Rd
@@ -0,0 +1,14 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_individual_file.R
+\name{remove_blank_chi}
+\alias{remove_blank_chi}
+\title{Remove blank CHI}
+\usage{
+remove_blank_chi(episode_file)
+}
+\arguments{
+\item{episode_file}{Tibble containing episodic data}
+}
+\description{
+Convert blank strings to NA and remove NAs from CHI column
+}
diff --git a/man/select.Rd b/man/select.Rd
new file mode 100644
index 000000000..435096d9a
--- /dev/null
+++ b/man/select.Rd
@@ -0,0 +1,30 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/aggregate_by_chi_zihao.R
+\name{vars_end_with}
+\alias{vars_end_with}
+\alias{vars_start_with}
+\alias{vars_contain}
+\title{select columns ending with some patterns}
+\usage{
+vars_end_with(data, vars, ignore_case = FALSE)
+
+vars_start_with(data, vars, ignore_case = FALSE)
+
+vars_contain(data, vars, ignore_case = FALSE)
+}
+\description{
+select columns ending with some patterns
+
+select columns starting with some patterns
+
+select columns contains some characters
+}
+\section{Functions}{
+\itemize{
+\item \code{vars_end_with()}: columns based on patterns
+
+\item \code{vars_start_with()}: columns based on patterns
+
+\item \code{vars_contain()}: columns based on patterns
+
+}}

From 55d5948a50a7714a4110d0c940ad28fd2f14663b Mon Sep 17 00:00:00 2001
From: James McMahon <james.mcmahon@phs.scot>
Date: Wed, 19 Jul 2023 12:55:42 +0100
Subject: [PATCH 11/16] Fix missed bracket in _targets.R

---
 _targets.R | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/_targets.R b/_targets.R
index ef2fbbe74..af0fd257c 100644
--- a/_targets.R
+++ b/_targets.R
@@ -561,8 +561,9 @@ list(
       process_tests_individual_file(
         data = individual_file,
         year = year
-),
-          tar_target(
+      )
+    ),
+    tar_target(
       episode_file_dataset,
       arrow::write_dataset(
         dataset = episode_file,

From e4c14652ec8d919fdaa077f193dfab31ca3f2782 Mon Sep 17 00:00:00 2001
From: James McMahon <james.mcmahon@phs.scot>
Date: Wed, 19 Jul 2023 15:14:25 +0100
Subject: [PATCH 12/16] Update arrow dataset targets

---
 _targets.R | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/_targets.R b/_targets.R
index af0fd257c..0886466a8 100644
--- a/_targets.R
+++ b/_targets.R
@@ -567,13 +567,31 @@ list(
       episode_file_dataset,
       arrow::write_dataset(
         dataset = episode_file,
-        path = fs::path_ext_remove(slf_episode_path),
+        path = fs::path(
+          get_year_dir(year),
+          stringr::str_glue("source-episode-file-{year}")
+        ),
         format = "parquet",
         # Should correspond to the available slfhelper filters
         partitioning = c("recid", "hscp2018"),
         compression = "zstd",
         version = "latest"
       )
+    ),
+    tar_target(
+      individual_file_dataset,
+      arrow::write_dataset(
+        dataset = individual_file,
+        path = fs::path(
+          get_year_dir(year),
+          stringr::str_glue("source-individual-file-{year}")
+        ),
+        format = "parquet",
+        # Should correspond to the available slfhelper filters
+        partitioning = c("hscp2018"),
+        compression = "zstd",
+        version = "latest"
+      )
     )
   )
 )

From 24012df030e61df8f7e0126507fd63c08747f38c Mon Sep 17 00:00:00 2001
From: James McMahon <james.mcmahon@phs.scot>
Date: Thu, 20 Jul 2023 19:27:36 +0100
Subject: [PATCH 13/16] Fix for years with no DN data (#755)

* Fix for years with no DN data

21/22 and 22/23 were failing because they didn't have `total_no_dn_contacts`. This is a bit of a crude fix, but it should work for any year: if the variable doesn't exist in the episode file, it will be created as `NA` in the individual file.

* Selectively clean up variables after
---
 R/create_individual_file.R | 36 +++++++++++++++++++++++-------------
 1 file changed, 23 insertions(+), 13 deletions(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index 675e2066a..b7812c806 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -25,7 +25,7 @@ create_individual_file <- function(
   }
 
   individual_file <- episode_file %>%
-    dplyr::select(
+    dplyr::select(dplyr::any_of(c(
       "year",
       "chi",
       "dob",
@@ -57,7 +57,7 @@ create_individual_file <- function(
       "hc_hours_annual",
       "hc_reablement",
       "ooh_case_id"
-    ) %>%
+    ))) %>%
     remove_blank_chi() %>%
     add_cij_columns() %>%
     add_all_columns() %>%
@@ -321,9 +321,21 @@ add_ooh_columns <- function(episode_file, prefix, condition) {
 #' @inheritParams add_acute_columns
 add_dn_columns <- function(episode_file, prefix, condition) {
   condition <- substitute(condition)
-  episode_file %>%
-    add_standard_cols(prefix, condition, episode = TRUE, cost = TRUE) %>%
-    dplyr::mutate("{prefix}_contacts" := dplyr::if_else(eval(condition), .data$total_no_dn_contacts, NA_integer_))
+  if ("total_no_dn_contacts" %in% names(episode_file)) {
+    episode_file %>%
+      add_standard_cols(prefix, condition, episode = TRUE, cost = TRUE) %>%
+      dplyr::mutate(
+        "{prefix}_contacts" := dplyr::if_else(
+          eval(condition),
+          .data$total_no_dn_contacts,
+          NA_integer_
+        )
+      )
+  } else {
+    episode_file %>%
+      add_standard_cols(prefix, condition, episode = TRUE, cost = TRUE) %>%
+      dplyr::mutate("{prefix}_contacts" := NA_integer_)
+  }
 }
 
 #' Add CMH columns
@@ -760,14 +772,12 @@ clean_individual_file <- function(individual_file, year) {
   cli::cli_alert_info("Clean individual file function started at {Sys.time()}")
 
   individual_file %>%
-    dplyr::select(
-      !c(
-        "ch_no_cost",
-        "no_paid_items",
-        "total_no_dn_contacts",
-        "cost_total_net_inc_dnas"
-      )
-    ) %>%
+    dplyr::select(dplyr::any_of(!c(
+      "ch_no_cost",
+      "no_paid_items",
+      "total_no_dn_contacts",
+      "cost_total_net_inc_dnas"
+    ))) %>%
     clean_up_gender() %>%
     dplyr::mutate(age = compute_mid_year_age(year, .data$dob))
 }

From 9b676fc2bcc0f656ed43d96a218258f2526a30fc Mon Sep 17 00:00:00 2001
From: James McMahon <james.mcmahon@phs.scot>
Date: Thu, 20 Jul 2023 19:28:00 +0100
Subject: [PATCH 14/16] Avoid duplicate `health_net_cost_inc_dnas` (#756)

* Selectively clean up variables after

* Avoid selecting variables we don't want

`cost_total_net_inc_dnas` was being picked up here which we didn't want.
---
 R/aggregate_by_chi_zihao.R | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/R/aggregate_by_chi_zihao.R b/R/aggregate_by_chi_zihao.R
index 0eee203e8..7d9ce5ed3 100644
--- a/R/aggregate_by_chi_zihao.R
+++ b/R/aggregate_by_chi_zihao.R
@@ -81,8 +81,7 @@ aggregate_by_chi_zihao <- function(episode_file) {
         "other",
         "dn",
         "nhs24",
-        "pcc",
-        "_dnas"
+        "pcc"
       )
     ),
     vars_start_with(

From 4eb6b9395b2b27fd6366b7a1874437005bc33ea4 Mon Sep 17 00:00:00 2001
From: James McMahon <james.mcmahon@phs.scot>
Date: Fri, 21 Jul 2023 12:25:41 +0100
Subject: [PATCH 15/16] Fix a typo

---
 R/create_individual_file.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index b7812c806..e2cf996a1 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -772,7 +772,7 @@ clean_individual_file <- function(individual_file, year) {
   cli::cli_alert_info("Clean individual file function started at {Sys.time()}")
 
   individual_file %>%
-    dplyr::select(dplyr::any_of(!c(
+    dplyr::select(!dplyr::any_of(c(
       "ch_no_cost",
       "no_paid_items",
       "total_no_dn_contacts",

From 1efe25e7a092310fafc931f2557d49e065a11a1c Mon Sep 17 00:00:00 2001
From: Jennit07 <67372904+Jennit07@users.noreply.github.com>
Date: Mon, 31 Jul 2023 12:54:46 +0100
Subject: [PATCH 16/16] Add tests for delayed discharges extract (#760)

* Add tests for delayed discharges extract

* Style code

* Change calculation to TRUE/FALSE

Co-authored-by: James McMahon <james.mcmahon@phs.scot>

* Remove TODO and add DD tests to targets pipeline

---------

Co-authored-by: Jennit07 <Jennit07@users.noreply.github.com>
Co-authored-by: James McMahon <james.mcmahon@phs.scot>
---
 NAMESPACE                               |  1 +
 R/process_tests_delayed_discharges.R    | 50 +++++++++++++++++++++++++
 _targets.R                              |  8 +++-
 man/process_tests_delayed_discharges.Rd | 20 ++++++++++
 man/produce_source_dd_tests.Rd          | 28 ++++++++++++++
 man/produce_source_pis_tests.Rd         |  4 ++
 6 files changed, 110 insertions(+), 1 deletion(-)
 create mode 100644 R/process_tests_delayed_discharges.R
 create mode 100644 man/process_tests_delayed_discharges.Rd
 create mode 100644 man/produce_source_dd_tests.Rd

diff --git a/NAMESPACE b/NAMESPACE
index d87bf9397..464cced34 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -115,6 +115,7 @@ export(process_tests_ae)
 export(process_tests_alarms_telecare)
 export(process_tests_care_home)
 export(process_tests_cmh)
+export(process_tests_delayed_discharges)
 export(process_tests_district_nursing)
 export(process_tests_episode_file)
 export(process_tests_gp_ooh)
diff --git a/R/process_tests_delayed_discharges.R b/R/process_tests_delayed_discharges.R
new file mode 100644
index 000000000..b540d1f74
--- /dev/null
+++ b/R/process_tests_delayed_discharges.R
@@ -0,0 +1,50 @@
+#' Process Delayed Discharges tests
+#'
+#' @description Takes the processed Delayed Discharges extract and produces
+#' a test comparison with the previous data. This is written to disk as xlsx.
+#'
+#' @param data a [tibble][tibble::tibble-package] of the processed data extract.
+#' @param year the financial year of the extract in the format '1718'.
+#'
+#' @return a [tibble][tibble::tibble-package] containing a test comparison.
+#'
+#' @export
+process_tests_delayed_discharges <- function(data, year) {
+  old_data <- get_existing_data_for_tests(data)
+
+  comparison <- produce_test_comparison(
+    old_data = produce_source_dd_tests(old_data),
+    new_data = produce_source_dd_tests(data)
+  ) %>%
+    write_tests_xlsx(sheet_name = "DD", year)
+
+  return(comparison)
+}
+
+#' Delayed Discharges extract tests
+#'
+#' @description Produce tests for the delayed discharges extract.
+#'
+#' @param data new or old data for testing summary flags
+#' (data is from [get_source_extract_path()])
+#'
+#' @return a dataframe with a count of each flag
+#' from [calculate_measures()]
+#'
+#' @family extract test functions
+#' for creating test flags
+#' @seealso calculate_measures
+produce_source_dd_tests <- function(data) {
+  test_flags <- data %>%
+    dplyr::mutate(
+      n_delay_episodes = 1L,
+      code9_episodes = .data$primary_delay_reason == "9"
+    ) %>%
+    create_hb_test_flags(.data$hbtreatcode) %>%
+    # keep variables for comparison
+    dplyr::select(c("n_delay_episodes":dplyr::last_col())) %>%
+    # use function to sum new test flags
+    calculate_measures(measure = "sum")
+
+  return(test_flags)
+}
diff --git a/_targets.R b/_targets.R
index 0886466a8..f50045aed 100644
--- a/_targets.R
+++ b/_targets.R
@@ -310,12 +310,18 @@ list(
         year
       )
     ),
-    # TODO add tests for the Delayed Discharges extract
     tar_target(source_dd_extract, process_extract_delayed_discharges(
       dd_data,
       year,
       write_to_disk = write_to_disk
     )),
+    tar_target(
+      tests_source_dd_extract,
+      process_tests_delayed_discharges(
+        source_dd_extract,
+        year
+      )
+    ),
     tar_target(source_dn_extract, process_extract_district_nursing(
       dn_data,
       year,
diff --git a/man/process_tests_delayed_discharges.Rd b/man/process_tests_delayed_discharges.Rd
new file mode 100644
index 000000000..68e1b8f17
--- /dev/null
+++ b/man/process_tests_delayed_discharges.Rd
@@ -0,0 +1,20 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/process_tests_delayed_discharges.R
+\name{process_tests_delayed_discharges}
+\alias{process_tests_delayed_discharges}
+\title{Process Delayed Discharges tests}
+\usage{
+process_tests_delayed_discharges(data, year)
+}
+\arguments{
+\item{data}{a \link[tibble:tibble-package]{tibble} of the processed data extract.}
+
+\item{year}{the financial year of the extract in the format '1718'.}
+}
+\value{
+a \link[tibble:tibble-package]{tibble} containing a test comparison.
+}
+\description{
+Takes the processed Delayed Discharges extract and produces
+a test comparison with the previous data. This is written to disk as xlsx.
+}
diff --git a/man/produce_source_dd_tests.Rd b/man/produce_source_dd_tests.Rd
new file mode 100644
index 000000000..2eb9f6455
--- /dev/null
+++ b/man/produce_source_dd_tests.Rd
@@ -0,0 +1,28 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/process_tests_delayed_discharges.R
+\name{produce_source_dd_tests}
+\alias{produce_source_dd_tests}
+\title{Delayed Discharges extract tests}
+\usage{
+produce_source_dd_tests(data)
+}
+\arguments{
+\item{data}{new or old data for testing summary flags
+(data is from \code{\link[=get_source_extract_path]{get_source_extract_path()}})}
+}
+\value{
+a dataframe with a count of each flag
+from \code{\link[=calculate_measures]{calculate_measures()}}
+}
+\description{
+Produce tests for the delayed discharges extract.
+}
+\seealso{
+calculate_measures
+
+Other extract test functions
+for creating test flags: 
+\code{\link{produce_source_pis_tests}()}
+}
+\concept{extract test functions
+for creating test flags}
diff --git a/man/produce_source_pis_tests.Rd b/man/produce_source_pis_tests.Rd
index 070cc789d..487ad2fd7 100644
--- a/man/produce_source_pis_tests.Rd
+++ b/man/produce_source_pis_tests.Rd
@@ -24,6 +24,10 @@ episode date variables.
 }
 \seealso{
 calculate_measures
+
+Other extract test functions
+for creating test flags: 
+\code{\link{produce_source_dd_tests}()}
 }
 \concept{extract test functions
 for creating test flags}