From 3f86f892653d72127228aeb4ba4e8dbe8ccec86f Mon Sep 17 00:00:00 2001 From: James McMahon Date: Mon, 17 Jul 2023 13:23:15 +0100 Subject: [PATCH 01/16] Fix some R CMD CHK build error and warnings (#743) * Add {future} and {future.callr} as required packages * Use `.data$` * Document all parameters * Simplify the code and avoid using `runif` * [check-spelling] Update metadata Update for https://github.com/Public-Health-Scotland/source-linkage-files/actions/runs/5573642848/attempts/1 Accepted in https://github.com/Public-Health-Scotland/source-linkage-files/pull/743#issuecomment-1637643299 Signed-off-by: check-spelling-bot --------- Signed-off-by: check-spelling-bot --- .github/actions/spelling/expect.txt | 4 ++++ DESCRIPTION | 2 ++ R/produce_test_comparison.R | 4 ++-- R/run_episode_file.R | 1 + R/write_tests_xlsx.R | 4 ++-- man/join_cohort_lookups.Rd | 2 ++ 6 files changed, 13 insertions(+), 4 deletions(-) diff --git a/.github/actions/spelling/expect.txt b/.github/actions/spelling/expect.txt index 50162e23f..23691cb31 100644 --- a/.github/actions/spelling/expect.txt +++ b/.github/actions/spelling/expect.txt @@ -94,6 +94,7 @@ itle iwalk jaccard jan +jennifer jul keydate keyring @@ -104,6 +105,8 @@ ltc ltcs lubridate magrittr +Mcbride +mcmahon MMMYY monthflag mpat @@ -200,5 +203,6 @@ xintercept xlsx yearstay YYYYQX +zihao zsav zstd diff --git a/DESCRIPTION b/DESCRIPTION index 31b205b36..02b87f21b 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -28,6 +28,8 @@ Imports: dtplyr (>= 1.3.0), fs (>= 1.6.1), fst (>= 0.9.8), + future (>= 1.33.0), + future.callr (>= 0.8.1), glue (>= 1.6.2), haven (>= 2.5.2), hms (>= 1.1.0), diff --git a/R/produce_test_comparison.R b/R/produce_test_comparison.R index dda1f2411..624623806 100644 --- a/R/produce_test_comparison.R +++ b/R/produce_test_comparison.R @@ -23,7 +23,7 @@ produce_test_comparison <- function(old_data, new_data, recid = FALSE) { dplyr::mutate( difference = round(.data$value_new - .data$value_old, digits = 2L), pct_change = 
.data$difference / .data$value_old, - issue = !dplyr::between(pct_change, -0.05, 0.05) + issue = !dplyr::between(.data$pct_change, -0.05, 0.05) ) } else { dplyr::full_join(old_data, @@ -34,7 +34,7 @@ produce_test_comparison <- function(old_data, new_data, recid = FALSE) { dplyr::mutate( difference = round(.data$value_new - .data$value_old, digits = 2L), pct_change = .data$difference / .data$value_old, - issue = !dplyr::between(pct_change, -0.05, 0.05) + issue = !dplyr::between(.data$pct_change, -0.05, 0.05) ) } } diff --git a/R/run_episode_file.R b/R/run_episode_file.R index 00e864f40..df51d430d 100644 --- a/R/run_episode_file.R +++ b/R/run_episode_file.R @@ -340,6 +340,7 @@ create_cohort_lookups <- function(data, year, update = latest_update()) { #' Join cohort lookups #' #' @inheritParams store_ep_file_vars +#' @inheritParams get_demographic_cohorts_path #' #' @return The data including the Demographic and Service Use lookups. join_cohort_lookups <- function(data, year, update = latest_update()) { diff --git a/R/write_tests_xlsx.R b/R/write_tests_xlsx.R index c517e496e..e187149d5 100644 --- a/R/write_tests_xlsx.R +++ b/R/write_tests_xlsx.R @@ -47,7 +47,7 @@ write_tests_xlsx <- function(comparison_data, sheet_name, year = NULL) { while (fs::file_exists(path = in_use_path) && seconds < max_wait) { # While the tests are in use (wait a random number of seconds from 1 to 30) cli::cli_progress_update() - wait <- round(runif(1, 1, 15)) + wait <- sample(x = 3:15, size = 1) Sys.sleep(wait) seconds <- seconds + wait @@ -56,7 +56,7 @@ write_tests_xlsx <- function(comparison_data, sheet_name, year = NULL) { } # Final check to maybe avoid corrupting the workbook - Sys.sleep(round(runif(1, 1, 3))) + Sys.sleep(sample(x = 1:3, size = 1)) if (!fs::file_exists(path = in_use_path)) { fs::file_create(path = in_use_path) } else { diff --git a/man/join_cohort_lookups.Rd b/man/join_cohort_lookups.Rd index 21f376bdc..fcd419a1b 100644 --- a/man/join_cohort_lookups.Rd +++ 
b/man/join_cohort_lookups.Rd @@ -10,6 +10,8 @@ join_cohort_lookups(data, year, update = latest_update()) \item{data}{The in progress episode file data.} \item{year}{The year to process, in FY format.} + +\item{update}{The update to use} } \value{ The data including the Demographic and Service Use lookups. From 5272eede4345c39ef456f24507a2d88d8f9b93b1 Mon Sep 17 00:00:00 2001 From: James McMahon Date: Mon, 17 Jul 2023 13:37:42 +0100 Subject: [PATCH 02/16] Some updates to `read_file()` (#727) * Use `!!` injection operator as a simpler workaround https://github.com/apache/arrow/issues/36658 * Add explicit `.rds.gz` and `.csv.gz` extension handling Previously this assumed any file ending in `.gz` was a CSV. * Style code --------- Co-authored-by: Moohan Co-authored-by: Jennit07 <67372904+Jennit07@users.noreply.github.com> --- R/read_file.R | 43 ++++++++++++++++++++++++--------- tests/testthat/test-read_file.R | 5 +++- 2 files changed, 35 insertions(+), 13 deletions(-) diff --git a/R/read_file.R b/R/read_file.R index 53300c70b..2941b62ed 100644 --- a/R/read_file.R +++ b/R/read_file.R @@ -16,10 +16,27 @@ #' @return the data a [tibble][tibble::tibble-package] #' @export read_file <- function(path, col_select = NULL, as_data_frame = TRUE, ...) { - valid_extensions <- c("rds", "fst", "sav", "zsav", "csv", "gz", "parquet") + valid_extensions <- c( + "rds", + "rds.gz", + "fst", + "sav", + "zsav", + "csv", + "csv.gz", + "parquet" + ) ext <- fs::path_ext(path) + if (ext == "gz") { + ext <- paste( + fs::path_ext(fs::path_ext_remove(path)), + "gz", + sep = "." + ) + } + if (!(ext %in% valid_extensions)) { cli::cli_abort(c( "x" = "Invalid extension: {.val {ext}}", @@ -36,17 +53,19 @@ read_file <- function(path, col_select = NULL, as_data_frame = TRUE, ...) 
{ } data <- switch(ext, - "rds" = readr::read_rds(path), - "fst" = fst::read_fst(path), - "sav" = haven::read_spss(path, ...), - "zsav" = haven::read_spss(path, ...), - "csv" = readr::read_csv(path, ..., show_col_types = FALSE), - "gz" = readr::read_csv(path, ..., show_col_types = FALSE), - "parquet" = if (is.null(col_select)) { - arrow::read_parquet(path, as_data_frame = as_data_frame, ...) - } else { - arrow::read_parquet(path, col_select = col_select, as_data_frame = as_data_frame, ...) - } + "rds" = readr::read_rds(file = path), + "rds.gz" = readr::read_rds(file = path), + "fst" = tibble::as_tibble(fst::read_fst(path = path)), + "sav" = haven::read_spss(file = path, ...), + "zsav" = haven::read_spss(file = path, ...), + "csv" = readr::read_csv(file = path, ..., show_col_types = FALSE), + "csv.gz" = readr::read_csv(file = path, ..., show_col_types = FALSE), + "parquet" = arrow::read_parquet( + file = path, + col_select = !!col_select, + as_data_frame = as_data_frame, + ... + ) ) return(data) diff --git a/tests/testthat/test-read_file.R b/tests/testthat/test-read_file.R index 392ba4a49..e823180fb 100644 --- a/tests/testthat/test-read_file.R +++ b/tests/testthat/test-read_file.R @@ -1,5 +1,6 @@ test_that("read_file works", { rds_path <- tempfile(fileext = ".rds") + rds_gz_path <- tempfile(fileext = ".rds.gz") fst_path <- tempfile(fileext = ".fst") sav_path <- tempfile(fileext = ".sav") zsav_path <- tempfile(fileext = ".zsav") @@ -10,6 +11,7 @@ test_that("read_file works", { aq_data <- tibble::as_tibble(datasets::airquality) readr::write_rds(aq_data, rds_path) + readr::write_rds(aq_data, rds_gz_path) fst::write_fst(aq_data, fst_path) haven::write_sav(aq_data, sav_path) haven::write_sav(aq_data, zsav_path, compress = "zsav") @@ -18,7 +20,8 @@ test_that("read_file works", { arrow::write_parquet(aq_data, parquet_path) expect_equal(aq_data, read_file(rds_path)) - expect_equal(aq_data, tibble::as_tibble(read_file(fst_path))) + expect_equal(aq_data, 
read_file(rds_gz_path)) + expect_equal(aq_data, read_file(fst_path)) expect_equal(aq_data, haven::zap_formats(read_file(sav_path))) expect_equal(aq_data, haven::zap_formats(read_file(zsav_path))) expect_equal(aq_data, read_file(csv_gz_path)) From 018dced9ecf5c79649c103a7e682287ce2e2c781 Mon Sep 17 00:00:00 2001 From: Jennit07 <67372904+Jennit07@users.noreply.github.com> Date: Mon, 17 Jul 2023 13:56:14 +0100 Subject: [PATCH 03/16] Rename `datazone` to `datazone2011` (#744) * Rename `datazone` to `datazone2011` * [check-spelling] Update metadata Update for https://github.com/Public-Health-Scotland/source-linkage-files/actions/runs/5575756558/attempts/1 Accepted in https://github.com/Public-Health-Scotland/source-linkage-files/pull/744#issuecomment-1638065893 Signed-off-by: check-spelling-bot --------- Signed-off-by: check-spelling-bot Co-authored-by: James McMahon --- .github/actions/spelling/expect.txt | 2 ++ R/fill_geographies.R | 2 +- R/process_extract_acute.R | 2 +- R/process_extract_district_nursing.R | 2 +- R/process_extract_gp_ooh.R | 2 +- R/process_extract_mental_health.R | 2 +- R/read_extract_acute.R | 2 +- R/read_extract_district_nursing.R | 2 +- R/read_extract_mental_health.R | 2 +- R/read_extract_nrs_deaths.R | 2 +- R/read_extract_ooh_consultations.R | 2 +- R/run_episode_file.R | 2 +- 12 files changed, 13 insertions(+), 11 deletions(-) diff --git a/.github/actions/spelling/expect.txt b/.github/actions/spelling/expect.txt index 23691cb31..a1800b02f 100644 --- a/.github/actions/spelling/expect.txt +++ b/.github/actions/spelling/expect.txt @@ -88,6 +88,7 @@ hms homecare hscp hscpnames +IDPC infyyear ipdc itle @@ -100,6 +101,7 @@ keydate keyring keytime keytimex +kis los ltc ltcs diff --git a/R/fill_geographies.R b/R/fill_geographies.R index 22b3f03ba..28bab7fa2 100644 --- a/R/fill_geographies.R +++ b/R/fill_geographies.R @@ -14,7 +14,7 @@ fill_geographies <- function(data) { "hbrescode", "hscp", "lca", - "datazone", + "datazone2011", "hbpraccode", 
"hbtreatcode", "gpprac" diff --git a/R/process_extract_acute.R b/R/process_extract_acute.R index db810b9fb..7d47d0ef4 100644 --- a/R/process_extract_acute.R +++ b/R/process_extract_acute.R @@ -79,7 +79,7 @@ process_extract_acute <- function(data, year, write_to_disk = TRUE) { "hbrescode", "lca", "hscp", - "datazone", + "datazone2011", "location", "hbtreatcode", "yearstay", diff --git a/R/process_extract_district_nursing.R b/R/process_extract_district_nursing.R index 2097b38d2..a1b3bf816 100644 --- a/R/process_extract_district_nursing.R +++ b/R/process_extract_district_nursing.R @@ -107,7 +107,7 @@ process_extract_district_nursing <- function( "gender", "gpprac", "postcode", - "datazone", + "datazone2011", "lca", "hscp", "hbrescode", diff --git a/R/process_extract_gp_ooh.R b/R/process_extract_gp_ooh.R index d85ce33f5..4add41cfa 100644 --- a/R/process_extract_gp_ooh.R +++ b/R/process_extract_gp_ooh.R @@ -111,7 +111,7 @@ process_extract_gp_ooh <- function(year, data_list, write_to_disk = TRUE) { "gpprac", "postcode", "hbrescode", - "datazone", + "datazone2011", "hscp", "hbtreatcode", "location", diff --git a/R/process_extract_mental_health.R b/R/process_extract_mental_health.R index f79eff35f..108c14c61 100644 --- a/R/process_extract_mental_health.R +++ b/R/process_extract_mental_health.R @@ -85,7 +85,7 @@ process_extract_mental_health <- function(data, year, write_to_disk = TRUE) { "hbrescode", "lca", "hscp", - "datazone", + "datazone2011", "location", "hbtreatcode", "stay", diff --git a/R/read_extract_acute.R b/R/read_extract_acute.R index 6c699d6b4..a0fba0707 100644 --- a/R/read_extract_acute.R +++ b/R/read_extract_acute.R @@ -107,7 +107,7 @@ read_extract_acute <- function(year, file_path = get_boxi_extract_path(year = ye disch = "Discharge Type Code", falls_adm = "Falls Related Admission (01)", lca = "Geo Council Area Code", - datazone = "Geo Data Zone 2011", + datazone2011 = "Geo Data Zone 2011", postcode = "Geo Postcode [C]", hscp = "Geo HSCP of Residence Code - 
current", conc = "Lead Consultant/HCP Code", diff --git a/R/read_extract_district_nursing.R b/R/read_extract_district_nursing.R index 5640fb7b7..607f9b47e 100644 --- a/R/read_extract_district_nursing.R +++ b/R/read_extract_district_nursing.R @@ -43,7 +43,7 @@ read_extract_district_nursing <- function( lca = "Patient Council Area Code (Contact)", postcode = "Patient Postcode [C] (Contact)", gpprac = "Practice Code (Contact)", - datazone = "Patient Data Zone 2011 (Contact)", + datazone2011 = "Patient Data Zone 2011 (Contact)", hbpraccode = "Practice NHS Board Code 9 (Contact)", hbtreatcode = "Treatment NHS Board Code 9", chi = "UPI Number [C]", diff --git a/R/read_extract_mental_health.R b/R/read_extract_mental_health.R index bbdd1d5f9..fe82732c8 100644 --- a/R/read_extract_mental_health.R +++ b/R/read_extract_mental_health.R @@ -83,7 +83,7 @@ read_extract_mental_health <- function( hbrescode = "NHS Board of Residence Code - current", lca = "Geo Council Area Code", hscp = "Geo HSCP of Residence Code - current", - datazone = "Geo Data Zone 2011", + datazone2011 = "Geo Data Zone 2011", location = "Treatment Location Code", hbtreatcode = "Treatment NHS Board Code - current", yearstay = "Occupied Bed Days (04)", diff --git a/R/read_extract_nrs_deaths.R b/R/read_extract_nrs_deaths.R index 8fd2f26e9..1734b23aa 100644 --- a/R/read_extract_nrs_deaths.R +++ b/R/read_extract_nrs_deaths.R @@ -39,7 +39,7 @@ read_extract_nrs_deaths <- function( dplyr::rename( death_location_code = "Death Location Code", lca = "Geo Council Area Code", - datazone = "Geo Data Zone 2011", + datazone2011 = "Geo Data Zone 2011", postcode = "Geo Postcode [C]", hscp = "Geo HSCP of Residence Code - current", death_board_occurrence = "NHS Board of Occurrence Code - current", diff --git a/R/read_extract_ooh_consultations.R b/R/read_extract_ooh_consultations.R index 1c32ca085..4e16527a3 100644 --- a/R/read_extract_ooh_consultations.R +++ b/R/read_extract_ooh_consultations.R @@ -34,7 +34,7 @@ 
read_extract_ooh_consultations <- function( postcode = "Patient Postcode [C]", hbrescode = "Patient NHS Board Code 9 - current", hscp = "HSCP of Residence Code Current", - datazone = "Patient Data Zone 2011", + datazone2011 = "Patient Data Zone 2011", gpprac = "Practice Code", ooh_case_id = "GUID", attendance_status = "Consultation Recorded", diff --git a/R/run_episode_file.R b/R/run_episode_file.R index df51d430d..45a4e6ed5 100644 --- a/R/run_episode_file.R +++ b/R/run_episode_file.R @@ -51,7 +51,7 @@ run_episode_file <- function(processed_data_list, year, write_to_disk = TRUE) { "cij_dis_spec", "cost_total_net", "hscp", - "datazone", + "datazone2011", "attendance_status", "deathdiag1", "deathdiag2", From eae87509e39b9d066ff880617a8690a2b41a00d7 Mon Sep 17 00:00:00 2001 From: Jennit07 <67372904+Jennit07@users.noreply.github.com> Date: Mon, 17 Jul 2023 16:02:07 +0100 Subject: [PATCH 04/16] Sort variables with issues `hbrescode` (HB2018), `datazone` and `hscp` (#746) * rename `hscp` to `hscp2018` * rename `spd` as `slf_pc_lookup` * Add `datazone2011` to coalesce code * Rename `datazone` to `datazone2011` * include `datazone2011_old` in selections * Update R/fill_geographies.R --------- Co-authored-by: James McMahon --- R/fill_geographies.R | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/R/fill_geographies.R b/R/fill_geographies.R index 28bab7fa2..58d001493 100644 --- a/R/fill_geographies.R +++ b/R/fill_geographies.R @@ -85,7 +85,7 @@ make_gpprac_lookup <- function(data) { } fill_postcode_geogs <- function(data) { - spd <- read_file(get_slf_postcode_path()) + slf_pc_lookup <- read_file(get_slf_postcode_path()) filled_postcodes <- dplyr::left_join( data, @@ -102,7 +102,7 @@ fill_postcode_geogs <- function(data) { ) %>% # Fill geographies dplyr::left_join( - spd, + slf_pc_lookup, by = "postcode", suffix = c("_old", "") ) %>% @@ -117,10 +117,11 @@ fill_postcode_geogs <- function(data) { cascade_geographies() %>% dplyr::mutate( hbrescode = 
dplyr::coalesce(.data$hb2018, .data$hbrescode), - hscp = dplyr::coalesce(.data$hscp2018, .data$hscp), - lca = dplyr::coalesce(.data$lca, .data$lca_old) + hscp2018 = dplyr::coalesce(.data$hscp2018, .data$hscp), + lca = dplyr::coalesce(.data$lca, .data$lca_old), + datazone2011 = dplyr::coalesce(.data$datazone2011, .data$datazone2011_old) ) %>% - dplyr::select(!c("hb2018", "hscp2018", "lca_old", "most_recent_postcode")) + dplyr::select(!c("hb2018", "hscp", "lca_old", "datazone2011_old", "most_recent_postcode")) return(filled_postcodes) } From 3b2c54df3e69523b8b16e2b5aa8fcf0b50cc66e7 Mon Sep 17 00:00:00 2001 From: James McMahon Date: Mon, 17 Jul 2023 16:23:45 +0100 Subject: [PATCH 05/16] Add priorities (default is 0) to targets (#745) * Add priorities (default is 0) to targets This should run the lookups and 'all year' files first. * Use `qs` instead of `rds` * Use {tarchetypes} * Style code --------- Co-authored-by: Moohan Co-authored-by: Jennit07 <67372904+Jennit07@users.noreply.github.com> --- _targets.R | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/_targets.R b/_targets.R index 3625fa072..869d0d194 100644 --- a/_targets.R +++ b/_targets.R @@ -59,7 +59,8 @@ list( process_lookup_sc_demographics( sc_demog_data, write_to_disk = write_to_disk - ) + ), + priority = 0.9 ), tar_target( tests_sc_demog_lookup, @@ -70,7 +71,8 @@ list( process_it_chi_deaths( data = it_chi_deaths_extract, write_to_disk = write_to_disk - ) + ), + priority = 0.9 ), tar_target( tests_it_chi_deaths, @@ -83,7 +85,8 @@ list( gpprac_ref_path = gpprac_ref_path, spd_path = spd_path, write_to_disk = write_to_disk - ) + ), + priority = 0.9 ), tar_target( tests_source_gp_lookup, @@ -96,16 +99,17 @@ list( simd_path = simd_path, locality_path = locality_path, write_to_disk = write_to_disk - ) + ), + priority = 0.9 ), tar_target( tests_source_pc_lookup, process_tests_lookup_pc(source_pc_lookup) ), ## Cost Lookups ## - tar_target(ch_cost_lookup, 
process_costs_ch_rmd()), - tar_target(dn_cost_lookup, process_costs_dn_rmd()), - tar_target(hc_cost_lookup, process_costs_hc_rmd()), + tar_target(ch_cost_lookup, process_costs_ch_rmd(), priority = 0.8), + tar_target(dn_cost_lookup, process_costs_dn_rmd(), priority = 0.8), + tar_target(hc_cost_lookup, process_costs_hc_rmd(), priority = 0.8), tar_target(gp_ooh_cost_lookup, process_costs_gp_ooh_rmd()), ## Social Care - 'All' data ## tar_target( @@ -122,7 +126,8 @@ list( all_at_extract, sc_demog_lookup = sc_demog_lookup, write_to_disk = write_to_disk - ) + ), + priority = 0.5 ), tar_target( all_home_care_extract, @@ -138,7 +143,8 @@ list( all_home_care_extract, sc_demog_lookup = sc_demog_lookup, write_to_disk = write_to_disk - ) + ), + priority = 0.5 ), tar_target( all_care_home_extract, @@ -157,7 +163,8 @@ list( ch_name_lookup_path = slf_ch_name_lookup_path, spd_path = spd_path, write_to_disk = write_to_disk - ) + ), + priority = 0.5 ), tar_target( tests_all_care_home, @@ -177,7 +184,8 @@ list( all_sds_extract, sc_demog_lookup = sc_demog_lookup, write_to_disk = write_to_disk - ) + ), + priority = 0.5 ), tar_map( list(year = years_to_run), @@ -256,14 +264,14 @@ list( get_boxi_extract_path(year = year, type = "GP_OoH-c"), format = "file" ), - tar_target(ooh_data, + tar_qs( + ooh_data, read_extract_gp_ooh( year, diagnosis_data_path, outcomes_data_path, consultations_data_path - ), - format = "rds" + ) ), ### Target source processed extracts ### tar_target(source_acute_extract, process_extract_acute( From d09efcd7e736d14937ee7bdf1ab13b4c3caadbd6 Mon Sep 17 00:00:00 2001 From: James McMahon Date: Tue, 18 Jul 2023 08:15:11 +0100 Subject: [PATCH 06/16] Make episode file output with `anon_chi` (#747) I've added this as a parameter so you can output CHI if desired, but the default is for anon_chi. For the tests, it swaps back to CHI as there are some tests which specifically use the CHI number.
Co-authored-by: Jennit07 <67372904+Jennit07@users.noreply.github.com> --- R/process_tests_episode_file.R | 5 +++-- R/run_episode_file.R | 16 +++++++++++++++- man/run_episode_file.Rd | 10 +++++++++- 3 files changed, 27 insertions(+), 4 deletions(-) diff --git a/R/process_tests_episode_file.R b/R/process_tests_episode_file.R index b595d1d54..46e9e7171 100644 --- a/R/process_tests_episode_file.R +++ b/R/process_tests_episode_file.R @@ -10,7 +10,7 @@ process_tests_episode_file <- function(data, year) { data <- data %>% dplyr::select( "year", - "chi", + "anon_chi", "gender", "postcode", "hbtreatcode", @@ -20,7 +20,8 @@ process_tests_episode_file <- function(data, year) { "record_keydate1", "record_keydate2", dplyr::contains(c("beddays", "cost", "cij")) - ) + ) %>% + slfhelper::get_chi() old_data <- get_existing_data_for_tests(data) diff --git a/R/run_episode_file.R b/R/run_episode_file.R index 45a4e6ed5..1f2bb33ed 100644 --- a/R/run_episode_file.R +++ b/R/run_episode_file.R @@ -4,11 +4,17 @@ #' @param year The year to process, in FY format. #' @param write_to_disk (optional) Should the data be written to disk default is #' `TRUE` i.e. write the data to disk. 
+#' @param anon_chi_out (Default:TRUE) Should `anon_chi` be used in the output +#' (instead of chi) #' #' @return a [tibble][tibble::tibble-package] containing the episode file #' @export #' -run_episode_file <- function(processed_data_list, year, write_to_disk = TRUE) { +run_episode_file <- function( + processed_data_list, + year, + write_to_disk = TRUE, + anon_chi_out = TRUE) { episode_file <- dplyr::bind_rows(processed_data_list) %>% create_cost_inc_dna() %>% apply_cost_uplift() %>% @@ -103,6 +109,14 @@ run_episode_file <- function(processed_data_list, year, write_to_disk = TRUE) { join_deaths_data(year) %>% load_ep_file_vars(year) + if (anon_chi_out) { + episode_file <- slfhelper::get_anon_chi( + episode_file, + chi_var = "chi", + drop = TRUE + ) + } + if (write_to_disk) { slf_path <- get_file_path( get_year_dir(year), diff --git a/man/run_episode_file.Rd b/man/run_episode_file.Rd index e85621b59..59d5fea1d 100644 --- a/man/run_episode_file.Rd +++ b/man/run_episode_file.Rd @@ -4,7 +4,12 @@ \alias{run_episode_file} \title{Produce the Source Episode file} \usage{ -run_episode_file(processed_data_list, year, write_to_disk = TRUE) +run_episode_file( + processed_data_list, + year, + write_to_disk = TRUE, + anon_chi_out = TRUE +) } \arguments{ \item{processed_data_list}{containing data from processed extracts.} @@ -13,6 +18,9 @@ run_episode_file(processed_data_list, year, write_to_disk = TRUE) \item{write_to_disk}{(optional) Should the data be written to disk default is \code{TRUE} i.e. 
write the data to disk.} + +\item{anon_chi_out}{(Default:TRUE) Should \code{anon_chi} be used in the output +(instead of chi)} } \value{ a \link[tibble:tibble-package]{tibble} containing the episode file From 654dc00232a5ff87c1e97d614ad360cf42e03fcd Mon Sep 17 00:00:00 2001 From: James McMahon Date: Tue, 18 Jul 2023 08:19:11 +0100 Subject: [PATCH 07/16] Write out as a partitioned arrow dataset (#726) * Also, write out as a partitioned arrow dataset This will allow us to test if this format provides any additional benefits. It is still also written out as a (non-partitioned) parquet file. * Fix a typo --------- Co-authored-by: Jennit07 <67372904+Jennit07@users.noreply.github.com> --- R/run_episode_file.R | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/R/run_episode_file.R b/R/run_episode_file.R index 1f2bb33ed..16b7ee3c2 100644 --- a/R/run_episode_file.R +++ b/R/run_episode_file.R @@ -118,7 +118,8 @@ run_episode_file <- function( } if (write_to_disk) { - slf_path <- get_file_path( + # TODO make the slf_path a function + slf_episode_path <- get_file_path( get_year_dir(year), stringr::str_glue( "source-episode-file-{year}.parquet" @@ -126,7 +127,17 @@ run_episode_file <- function( check_mode = "write" ) - write_file(episode_file, slf_path) + write_file(episode_file, slf_episode_path) + + arrow::write_dataset( + dataset = episode_file, + path = fs::path_ext_remove(slf_episode_path), + format = "parquet", + # Should correspond to the available slfhelper filters + partitioning = c("recid", "hscp2018"), + compression = "zstd", + version = "latest" + ) } return(episode_file) From ee7445bf831b362bacdf3b899e0bbd70fb07b1fd Mon Sep 17 00:00:00 2001 From: James McMahon Date: Tue, 18 Jul 2023 14:39:40 +0100 Subject: [PATCH 08/16] Write the episode file as a partitioned dataset (#750) * Revert "Write out as a partitioned arrow dataset (#726)" This reverts commit 654dc00232a5ff87c1e97d614ad360cf42e03fcd. 
* Write the episode file out as a partitioned dataset This is its own target so it won't hold up the rest of the processing. --- R/run_episode_file.R | 10 ---------- _targets.R | 12 ++++++++++++ 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/R/run_episode_file.R b/R/run_episode_file.R index 16b7ee3c2..668a40124 100644 --- a/R/run_episode_file.R +++ b/R/run_episode_file.R @@ -128,16 +128,6 @@ run_episode_file <- function( ) write_file(episode_file, slf_episode_path) - - arrow::write_dataset( - dataset = episode_file, - path = fs::path_ext_remove(slf_episode_path), - format = "parquet", - # Should correspond to the available slfhelper filters - partitioning = c("recid", "hscp2018"), - compression = "zstd", - version = "latest" - ) } return(episode_file) diff --git a/_targets.R b/_targets.R index 869d0d194..58e5f573f 100644 --- a/_targets.R +++ b/_targets.R @@ -547,6 +547,18 @@ list( data = episode_file, year = year ) + ), + tar_target( + episode_file_dataset, + arrow::write_dataset( + dataset = episode_file, + path = fs::path_ext_remove(slf_episode_path), + format = "parquet", + # Should correspond to the available slfhelper filters + partitioning = c("recid", "hscp2018"), + compression = "zstd", + version = "latest" + ) ) ) ) From 74109bfdc7b9741922cbaad310af32a5045d8f18 Mon Sep 17 00:00:00 2001 From: James McMahon Date: Wed, 19 Jul 2023 11:06:04 +0100 Subject: [PATCH 09/16] Fix for anon_chi missing (#752) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Fix for anon_chi missing `slfhelper::get_anon_chi` converts `NA` to "TkE=" which would then be converted back to "e1" 🤯 This fixes that by making NA CHIs blank `""`, then making blank anon_chis NA after. slfhelper should be updated so that it always converts NA to NA. 
* Update run_episode_file.R * Style code * Update documentation --------- Co-authored-by: Moohan --- R/run_episode_file.R | 20 ++++++++++---------- man/correct_cij_vars.Rd | 2 +- man/create_cohort_lookups.Rd | 2 +- man/create_cost_inc_dna.Rd | 2 +- man/fill_missing_cij_markers.Rd | 2 +- man/join_cohort_lookups.Rd | 2 +- man/join_sparra_hhg.Rd | 2 +- man/load_ep_file_vars.Rd | 2 +- man/store_ep_file_vars.Rd | 6 +++--- 9 files changed, 20 insertions(+), 20 deletions(-) diff --git a/R/run_episode_file.R b/R/run_episode_file.R index 668a40124..852a4fd8b 100644 --- a/R/run_episode_file.R +++ b/R/run_episode_file.R @@ -86,7 +86,7 @@ run_episode_file <- function( ) ) %>% # Check chi is valid using phsmethods function - # If the CHI is invalid for whatever reason, set the CHI to blank string + # If the CHI is invalid for whatever reason, set the CHI to NA dplyr::mutate( chi = dplyr::if_else( phsmethods::chi_check(.data$chi) != "Valid CHI", @@ -110,11 +110,11 @@ run_episode_file <- function( load_ep_file_vars(year) if (anon_chi_out) { - episode_file <- slfhelper::get_anon_chi( - episode_file, - chi_var = "chi", - drop = TRUE - ) + # TODO When slfhelper is updated remove the unnecessary code + episode_file <- episode_file %>% + tidyr::replace_na(list(chi = "")) %>% + slfhelper::get_anon_chi() %>% + dplyr::mutate(anon_chi = dplyr::na_if(.data$anon_chi, "")) } if (write_to_disk) { @@ -135,10 +135,10 @@ run_episode_file <- function( #' Store the unneeded episode file variables #' -#' @param data The in progress episode file data. +#' @param data The in-progress episode file data. #' @inheritParams run_episode_file -#' @param vars_to_keep a character vector of variable to keep, all others will -#' be stored. +#' @param vars_to_keep a character vector of the variables to keep, all others +#' will be stored. 
#' #' @return `data` with only the `vars_to_keep` kept store_ep_file_vars <- function(data, year, vars_to_keep) { @@ -324,7 +324,7 @@ create_cost_inc_dna <- function(data) { #' #' @return The data unchanged (the cohorts are written to disk) create_cohort_lookups <- function(data, year, update = latest_update()) { - # Use future so the cohorts can be create simultaneously (in parallel) + # Use future so the cohorts can be created simultaneously (in parallel) future::plan(strategy = future.callr::callr, .skip = TRUE) options(future.globals.maxSize = 21474836480) diff --git a/man/correct_cij_vars.Rd b/man/correct_cij_vars.Rd index 18ce990f8..97a7f046f 100644 --- a/man/correct_cij_vars.Rd +++ b/man/correct_cij_vars.Rd @@ -7,7 +7,7 @@ correct_cij_vars(data) } \arguments{ -\item{data}{The in progress episode file data.} +\item{data}{The in-progress episode file data.} } \value{ The data with CIJ variables corrected. diff --git a/man/create_cohort_lookups.Rd b/man/create_cohort_lookups.Rd index cbfc1442f..f0ad267aa 100644 --- a/man/create_cohort_lookups.Rd +++ b/man/create_cohort_lookups.Rd @@ -7,7 +7,7 @@ create_cohort_lookups(data, year, update = latest_update()) } \arguments{ -\item{data}{The in progress episode file data.} +\item{data}{The in-progress episode file data.} \item{year}{The year to process, in FY format.} diff --git a/man/create_cost_inc_dna.Rd b/man/create_cost_inc_dna.Rd index 588c602be..69e7e37b5 100644 --- a/man/create_cost_inc_dna.Rd +++ b/man/create_cost_inc_dna.Rd @@ -7,7 +7,7 @@ create_cost_inc_dna(data) } \arguments{ -\item{data}{The in progress episode file data.} +\item{data}{The in-progress episode file data.} } \value{ The data with cost including dna. 
diff --git a/man/fill_missing_cij_markers.Rd b/man/fill_missing_cij_markers.Rd index 002c8d927..03b64217e 100644 --- a/man/fill_missing_cij_markers.Rd +++ b/man/fill_missing_cij_markers.Rd @@ -7,7 +7,7 @@ fill_missing_cij_markers(data) } \arguments{ -\item{data}{The in progress episode file data.} +\item{data}{The in-progress episode file data.} } \value{ A data frame with CIJ markers filled in for those missing. diff --git a/man/join_cohort_lookups.Rd b/man/join_cohort_lookups.Rd index fcd419a1b..445dcd7c0 100644 --- a/man/join_cohort_lookups.Rd +++ b/man/join_cohort_lookups.Rd @@ -7,7 +7,7 @@ join_cohort_lookups(data, year, update = latest_update()) } \arguments{ -\item{data}{The in progress episode file data.} +\item{data}{The in-progress episode file data.} \item{year}{The year to process, in FY format.} diff --git a/man/join_sparra_hhg.Rd b/man/join_sparra_hhg.Rd index 9bbdd916a..ab4d3b946 100644 --- a/man/join_sparra_hhg.Rd +++ b/man/join_sparra_hhg.Rd @@ -7,7 +7,7 @@ join_sparra_hhg(data, year) } \arguments{ -\item{data}{The in progress episode file data.} +\item{data}{The in-progress episode file data.} \item{year}{The year to process, in FY format.} } diff --git a/man/load_ep_file_vars.Rd b/man/load_ep_file_vars.Rd index d290ba512..cee9cc440 100644 --- a/man/load_ep_file_vars.Rd +++ b/man/load_ep_file_vars.Rd @@ -7,7 +7,7 @@ load_ep_file_vars(data, year) } \arguments{ -\item{data}{The in progress episode file data.} +\item{data}{The in-progress episode file data.} \item{year}{The year to process, in FY format.} } diff --git a/man/store_ep_file_vars.Rd b/man/store_ep_file_vars.Rd index f31f63976..06316aac1 100644 --- a/man/store_ep_file_vars.Rd +++ b/man/store_ep_file_vars.Rd @@ -7,12 +7,12 @@ store_ep_file_vars(data, year, vars_to_keep) } \arguments{ -\item{data}{The in progress episode file data.} +\item{data}{The in-progress episode file data.} \item{year}{The year to process, in FY format.} -\item{vars_to_keep}{a character vector of variable to keep, 
all others will -be stored.} +\item{vars_to_keep}{a character vector of the variables to keep, all others +will be stored.} } \value{ \code{data} with only the \code{vars_to_keep} kept From 8db37690172f5de1a7a580dda0c2a2da293d6d94 Mon Sep 17 00:00:00 2001 From: James McMahon Date: Wed, 19 Jul 2023 12:43:20 +0100 Subject: [PATCH 10/16] Create individual file (#715) * Until L594 * Converted until L677 * Until L731 * Update documentation * Remove test ref * Style code * WIP writing functions to fill postcode in line with previous DOB functions * Update documentation * implement quick fix for running 22/23 * Style code * Fix missed comma * Exclude DD code for now - TEMP fix * Correct/rename variables * Style code * Include NSU in `check_year_valid` * Update `check_year_valid_tests` * Update documentation * Update `add_nsu_cohort` to pick up years valid * Style code * remove extra `!` * Exclude `cij_delay` * Style code * improve `max_no_inf()` * Use pmin/max instead of `rowwise` * improve `min_no_inf()` * Use n_distinct(cij_marker) * deal with distinct(ch_chi_cis) * use n_distinct(ooh_case_id) * remove `find_non_duplicates` * Use dplyr::if_else() Co-authored-by: James McMahon * Fix typo in `ooh_covid_assessment` * Move `ooh_case_id` to aggregate * Use `slfhelper::ltc_vars` * Remove `clean_up_dob` Already done in `correct_demographics` * Update documentation * [check-spelling] Update metadata Update for https://github.com/Public-Health-Scotland/source-linkage-files/actions/runs/4981058958/attempts/1 Accepted in https://github.com/Public-Health-Scotland/source-linkage-files/pull/654#issuecomment-1551009850 Signed-off-by: check-spelling-bot * Use `start_next_fy_quarter` in place of rowwise * Style code * Use `compute_mid_year_age` * convert code into data.table for improving speed * Update `get_fy_dates`function * remove `date_from_fy`, use `get_fy_dates` * Update documentation * Remove `clean_up_postcode` function Not needed anymore * Remove non duplicates function/move 
to aggregate * Style code * Update documentation * Add time stamps to `create_individual_file` * Style code * remove `clean_up_postcode` * Deal with ch cis episodes * Style code * add .data$ * Turn ch aggregate into a data table * Style code * use ch_chi_cis * remove `preventable_admissions` from aggregate * exclude `hh_in_fy` for now * Style code * Test - exclude `sc_` vars from aggregate * Style code * Exclude for now * exclude for now * Style code * automate `check_year_valid` * Return dummy file path for NSU not valid * Style code * Fix brackets in aggregate * TEMP - exclude variables * Use `phsmethods::sex_from_chi` * Style code * Add ungroup() * lowercase dob * Remove as.data.table * rewrite aggregate_by_chi with data.table * Style code * minor changes * Use the updated function * to properly import data.table * remove redundant columns dob postcode and gpprac * minor changes to remove redundant postcode gpprac columns * Style code * rename columns with small letters * Style code * newaggregate_ch_episodes * Update documentation * add functions to replace regular expressions to select column/variables * Update documentation * Style code * minor changes * add a missing variable, cij_delay * Style code * add variables cij_delay, preventable_beddays * add missing variables health_net_cost, health_net_costincdnas, and cmh, dd sds columns * Style code * add more variables needed * Style code * Update R/link_delayed_discharge_eps.R * Style code * amend costs * Style code * Revert "amend costs" This reverts commit 8048e68c829edbf6c0c43e1bf3ade1d142e0e250. 
* Add DN and cij_delay back in * fix the issue * Style code * remove running in chunks * Style code * Update tests to include missing variables * Remove unnecessary comma * fix the bug of preventable_beddays * Update documentation * fix total ae_attendances * fix the bug of preventable_admissions * fix the bug of hbrescode etc * minor fix * minor fix * Style code * Fix some warnings being produced by the tests * Fix failing test * remove running in chunks * Style code * Update the targets config to use `timestamp_positives` as the default reporter * fix the bug of preventable_beddays * Update documentation * fix total ae_attendances * fix the bug of preventable_admissions * fix the bug of hbrescode etc * minor fix * minor fix * Style code * fix home care cost * add ipdc to fix maternity * fix preventable admission and care home cost * fix preventable_admissions and calculate preventable_beddays here * add monthly_beddays and yearstay to dd * Style code * fix preventable_admissions and preventable_beddays * Style code * include parameter for write to disk/year * Add lookups to indiv file creation pipeline * include parameter for write to disk/year * fix delay discharge beddays and yearstay * Style code * fix preventable issues * Style code * fix the issue of preventable stuff * Style code * Update R/aggregate_by_chi_zihao.R * Update documentation * Fix minor typos * [check-spelling] Update metadata Update for https://github.com/Public-Health-Scotland/source-linkage-files/actions/runs/5443581387/attempts/1 Accepted in https://github.com/Public-Health-Scotland/source-linkage-files/pull/709#issuecomment-1617917895 Signed-off-by: check-spelling-bot * Remove some obsolete comments * Remove some unnecessary brackets * Reformat some code * Use some `dplyr` functions for readability * Style code * Update R/link_delayed_discharge_eps.R * Style code * Remove some code which is no longer needed We now match on these variables after * Work out preventable admissions with 
similar indicators * Lowercase variable names * Restore `cij_delay` * Restore DN variables * Tidy the code and use integers where possible * Supply `year` as a parameter to `clean_up_ch` * Supply `year` as a parameter to `clean_individual_file` * Only keep required variables to save memory * Rename the parameter so the documentation works * Use `setnames` to change names to lower * Remove unneeded code * Update file path name * Trim the return code * Some fixes * Correctly compute `ooh_cases` * Update documentation * Style code * [check-spelling] Update metadata Update for https://github.com/Public-Health-Scotland/source-linkage-files/actions/runs/5466392495/attempts/1 Accepted in https://github.com/Public-Health-Scotland/source-linkage-files/pull/719#issuecomment-1623280566 Signed-off-by: check-spelling-bot * Add targets for the individual file * Fix missed pipe * Style code * Update some targets to only run once a week * Make the deaths lookup unique * Add `year` back to the individual file * Remove `cost_total_net_inc_dnas` from the indiv file (#737) * Drop `cost_total_net_inc_dnas` * Rename `health_net_costincdnas` to `health_net_cost_inc_dnas` * Join slf lookups onto individual file (#724) * Create function for matching on slf lookups * fix some build warnings * Add `hbrescode` to select list * Pass lookups as parameters/deal with hbrescode * Update R/create_individual_file.R --------- Co-authored-by: James McMahon * Join sc client variables onto individual file (#740) * New function for matching sc client to indiv file * Style code * [check-spelling] Update metadata Update for https://github.com/Public-Health-Scotland/source-linkage-files/actions/runs/5555048903/attempts/1 Accepted in https://github.com/Public-Health-Scotland/source-linkage-files/pull/740#issuecomment-1635955654 Signed-off-by: check-spelling-bot * Code layout * Style code * Remove redundant sc variables Co-authored-by: James McMahon * Update comments Co-authored-by: James McMahon * Update 
comments Co-authored-by: James McMahon * Sort order of parameters to pass `data` first * Update documentation * Style code * Update R/create_individual_file.R * Update R/create_individual_file.R * Update R/create_individual_file.R * Style code --------- Signed-off-by: check-spelling-bot Co-authored-by: Jennit07 Co-authored-by: James McMahon Co-authored-by: Moohan * Update documentation * Output the individual file with `anon_chi` (#748) * Make episode file output with `anon_chi` I've added this as a parameter so you can output CHI if desired, but the default is for anon_chi. For the tests, it swaps back to CHI as there are some tests which specifically use the CHI number. * Output `anon_chi` in the individual file * Style code * Sort variables with issues `hbrescode` (HB2018), `datazone` and `hscp` (#746) * rename `hscp` to `hscp2018` * rename `spd` as `slf_pc_lookup` * Add `datazone2011` to coalesce code * Rename `datazone` to `datazone2011` * include `datazone2011_old` in selections * Update R/fill_geographies.R --------- Co-authored-by: James McMahon * Fix for anon_chi being NA --------- Co-authored-by: Moohan Co-authored-by: Jennit07 <67372904+Jennit07@users.noreply.github.com> --------- Signed-off-by: check-spelling-bot Co-authored-by: Mandy Norrbo Co-authored-by: jr-mandy Co-authored-by: shintoLampgit config --global user.email bateman.mcbride@phs.scotm git config --global user.name shintoLamp Co-authored-by: shintoLamp Co-authored-by: Jennit07 <67372904+Jennit07@users.noreply.github.com> Co-authored-by: Jennifer Thom Co-authored-by: Jennit07 Co-authored-by: Zihao Li Co-authored-by: lizihao-anu Co-authored-by: Moohan Co-authored-by: Zihao Li --- .github/actions/spelling/expect.txt | 13 + DESCRIPTION | 3 +- NAMESPACE | 3 + R/aggregate_by_chi_zihao.R | 215 +++++++ R/create_individual_file.R | 857 ++++++++++++++++++++++++++++ R/process_tests_individual_file.R | 31 +- _targets.R | 14 + man/add_acute_columns.Rd | 18 + man/add_ae_columns.Rd | 18 + 
man/add_all_columns.Rd | 15 + man/add_at_columns.Rd | 18 + man/add_ch_columns.Rd | 18 + man/add_cij_columns.Rd | 14 + man/add_cmh_columns.Rd | 18 + man/add_dd_columns.Rd | 18 + man/add_dn_columns.Rd | 18 + man/add_gls_columns.Rd | 18 + man/add_hc_columns.Rd | 18 + man/add_hl1_columns.Rd | 18 + man/add_ipdc_cols.Rd | 23 + man/add_mat_columns.Rd | 18 + man/add_mh_columns.Rd | 18 + man/add_nrs_columns.Rd | 18 + man/add_nsu_columns.Rd | 18 + man/add_ooh_columns.Rd | 18 + man/add_op_columns.Rd | 18 + man/add_pis_columns.Rd | 18 + man/add_sds_columns.Rd | 18 + man/add_standard_cols.Rd | 28 + man/aggregate_by_chi.Rd | 15 + man/aggregate_by_chi_zihao.Rd | 15 + man/aggregate_ch_episodes.Rd | 14 + man/aggregate_ch_episodes_zihao.Rd | 14 + man/clean_individual_file.Rd | 16 + man/clean_up_ch.Rd | 16 + man/clean_up_gender.Rd | 14 + man/condition_cols.Rd | 13 + man/create_individual_file.Rd | 34 ++ man/join_sc_client.Rd | 26 + man/join_slf_lookup_vars.Rd | 27 + man/max_no_inf.Rd | 16 + man/min_no_inf.Rd | 16 + man/recode_gender.Rd | 14 + man/remove_blank_chi.Rd | 14 + man/select.Rd | 30 + 45 files changed, 1818 insertions(+), 16 deletions(-) create mode 100644 R/aggregate_by_chi_zihao.R create mode 100644 R/create_individual_file.R create mode 100644 man/add_acute_columns.Rd create mode 100644 man/add_ae_columns.Rd create mode 100644 man/add_all_columns.Rd create mode 100644 man/add_at_columns.Rd create mode 100644 man/add_ch_columns.Rd create mode 100644 man/add_cij_columns.Rd create mode 100644 man/add_cmh_columns.Rd create mode 100644 man/add_dd_columns.Rd create mode 100644 man/add_dn_columns.Rd create mode 100644 man/add_gls_columns.Rd create mode 100644 man/add_hc_columns.Rd create mode 100644 man/add_hl1_columns.Rd create mode 100644 man/add_ipdc_cols.Rd create mode 100644 man/add_mat_columns.Rd create mode 100644 man/add_mh_columns.Rd create mode 100644 man/add_nrs_columns.Rd create mode 100644 man/add_nsu_columns.Rd create mode 100644 man/add_ooh_columns.Rd create mode 
100644 man/add_op_columns.Rd create mode 100644 man/add_pis_columns.Rd create mode 100644 man/add_sds_columns.Rd create mode 100644 man/add_standard_cols.Rd create mode 100644 man/aggregate_by_chi.Rd create mode 100644 man/aggregate_by_chi_zihao.Rd create mode 100644 man/aggregate_ch_episodes.Rd create mode 100644 man/aggregate_ch_episodes_zihao.Rd create mode 100644 man/clean_individual_file.Rd create mode 100644 man/clean_up_ch.Rd create mode 100644 man/clean_up_gender.Rd create mode 100644 man/condition_cols.Rd create mode 100644 man/create_individual_file.Rd create mode 100644 man/join_sc_client.Rd create mode 100644 man/join_slf_lookup_vars.Rd create mode 100644 man/max_no_inf.Rd create mode 100644 man/min_no_inf.Rd create mode 100644 man/recode_gender.Rd create mode 100644 man/remove_blank_chi.Rd create mode 100644 man/select.Rd diff --git a/.github/actions/spelling/expect.txt b/.github/actions/spelling/expect.txt index a1800b02f..3236edd84 100644 --- a/.github/actions/spelling/expect.txt +++ b/.github/actions/spelling/expect.txt @@ -28,6 +28,7 @@ cmh CNWs commhosp congen +costincdnas costmonthnum costsfy covr @@ -45,6 +46,7 @@ dbconnect dbplyr deathdiag demog +dfc disch dischloc dischto @@ -70,6 +72,7 @@ fyyear geogs ggplot GLS +gls gms GPOo gpprac @@ -86,6 +89,7 @@ hhg hjust hms homecare +homev hscp hscpnames IDPC @@ -102,6 +106,8 @@ keyring keytime keytimex kis +lgl +kis los ltc ltcs @@ -116,6 +122,7 @@ multiday multisession multistaff NAs +newcons nhs nhshosp NRS @@ -147,7 +154,9 @@ purrr quickstart Rbuildignore rcmdcheck +rdd rds +reabl reablement readcode readr @@ -164,8 +173,12 @@ rspm RStudio rstudioapi Rtype +SDcols seealso selfharm +setkeyv +setnafill +setnames Siar sigfac simd diff --git a/DESCRIPTION b/DESCRIPTION index 02b87f21b..a437b80cc 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -55,7 +55,8 @@ Imports: stringr (>= 1.5.0), tibble (>= 3.2.1), tidyr (>= 1.3.0), - tidyselect (>= 1.2.0) + tidyselect (>= 1.2.0), + zoo (>= 1.8.0) Suggests: covr 
(>= 3.6.1), roxygen2 (>= 7.2.3), diff --git a/NAMESPACE b/NAMESPACE index 642146578..d87bf9397 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -13,6 +13,7 @@ export(convert_hscp_to_hscpnames) export(convert_numeric_to_date) export(convert_sending_location_to_lca) export(convert_year_to_fyyear) +export(create_individual_file) export(create_service_use_cohorts) export(end_fy) export(end_fy_quarter) @@ -160,6 +161,8 @@ export(start_fy) export(start_fy_quarter) export(start_next_fy_quarter) export(write_file) +importFrom(data.table,.N) +importFrom(data.table,.SD) importFrom(magrittr,"%>%") importFrom(readr,col_character) importFrom(readr,col_date) diff --git a/R/aggregate_by_chi_zihao.R b/R/aggregate_by_chi_zihao.R new file mode 100644 index 000000000..0eee203e8 --- /dev/null +++ b/R/aggregate_by_chi_zihao.R @@ -0,0 +1,215 @@ +#' Aggregate by CHI +#' +#' @description Aggregate episode file by CHI to convert into +#' individual file. +#' +#' @importFrom data.table .N +#' @importFrom data.table .SD +#' +#' @inheritParams create_individual_file +aggregate_by_chi_zihao <- function(episode_file) { + cli::cli_alert_info("Aggregate by CHI function started at {Sys.time()}") + + # Convert to data.table + data.table::setDT(episode_file) + + # Ensure all variable names are lowercase + data.table::setnames(episode_file, stringr::str_to_lower) + + # Sort the data + data.table::setkeyv( + episode_file, + c( + "chi", + "record_keydate1", + "keytime1", + "record_keydate2", + "keytime2" + ) + ) + + data.table::setnames( + episode_file, + c( + "ch_chi_cis", "cij_marker", "ooh_case_id" + # ,"hh_in_fy" + ), + c( + "ch_cis_episodes", "cij_total", "ooh_cases" + # ,"hl1_in_fy" + ) + ) + + # column specification, grouped by chi + # columns to select last + cols2 <- c( + "postcode", + "dob", + "gpprac", + vars_start_with(episode_file, "sc_") + ) + # columns to count unique rows + cols3 <- c( + "ch_cis_episodes", + "cij_total", + "cij_el", + "cij_non_el", + "cij_mat", + "cij_delay", + "ooh_cases", + 
"preventable_admissions" + ) + # columns to sum up + cols4 <- c( + vars_end_with( + episode_file, + c( + "episodes", + "beddays", + "cost", + "attendances", + "attend", + "contacts", + "hours", + "alarms", + "telecare", + "paid_items", + "advice", + "homev", + "time", + "assessment", + "other", + "dn", + "nhs24", + "pcc", + "_dnas" + ) + ), + vars_start_with( + episode_file, + "sds_option" + ), + "health_net_cost_inc_dnas" + ) + cols4 <- cols4[!(cols4 %in% c("ch_cis_episodes"))] + # columns to select maximum + cols5 <- c("nsu", vars_contain(episode_file, c("hl1_in_fy"))) + data.table::setnafill(episode_file, fill = 0L, cols = cols5) + # compute + individual_file_cols1 <- episode_file[, + .(gender = mean(gender)), + by = "chi" + ] + individual_file_cols2 <- episode_file[, + .SD[.N], + .SDcols = cols2, + by = "chi" + ] + individual_file_cols3 <- episode_file[, + lapply(.SD, function(x) { + data.table::uniqueN(x, na.rm = TRUE) + }), + .SDcols = cols3, + by = "chi" + ] + individual_file_cols4 <- episode_file[, + lapply(.SD, function(x) { + sum(x, na.rm = TRUE) + }), + .SDcols = cols4, + by = "chi" + ] + individual_file_cols5 <- episode_file[, + lapply(.SD, function(x) max(x, na.rm = TRUE)), + .SDcols = cols5, + by = "chi" + ] + individual_file_cols6 <- episode_file[, + .( + preventable_beddays = ifelse( + max(cij_ppa, na.rm = TRUE), + max(cij_end_date) - min(cij_start_date), + NA_real_ + ) + ), + # cij_marker has been renamed as cij_total + by = c("chi", "cij_total") + ] + individual_file_cols6 <- individual_file_cols6[, + .( + preventable_beddays = sum(preventable_beddays, na.rm = TRUE) + ), + by = "chi" + ] + + individual_file <- dplyr::bind_cols( + individual_file_cols1, + individual_file_cols2[, chi := NULL], + individual_file_cols3[, chi := NULL], + individual_file_cols4[, chi := NULL], + individual_file_cols5[, chi := NULL], + individual_file_cols6[, chi := NULL] + ) + + # convert back to tibble + return(dplyr::as_tibble(individual_file)) +} + + +#' select 
columns ending with some patterns +#' @describeIn select columns based on patterns +vars_end_with <- function(data, vars, ignore_case = FALSE) { + names(data)[stringr::str_ends( + names(data), + stringr::regex(paste(vars, collapse = "|"), + ignore_case = ignore_case + ) + )] +} + +#' select columns starting with some patterns +#' @describeIn select columns based on patterns +vars_start_with <- function(data, vars, ignore_case = FALSE) { + names(data)[stringr::str_starts( + names(data), + stringr::regex(paste(vars, collapse = "|"), + ignore_case = ignore_case + ) + )] +} + +#' select columns contains some characters +#' @describeIn select columns based on patterns +vars_contain <- function(data, vars, ignore_case = FALSE) { + names(data)[stringr::str_detect( + names(data), + stringr::regex(paste(vars, collapse = "|"), + ignore_case = ignore_case + ) + )] +} + +#' Aggregate CIS episodes +#' +#' @description Aggregate CH variables by CHI and CIS. +#' +#' @inheritParams create_individual_file +aggregate_ch_episodes_zihao <- function(episode_file) { + cli::cli_alert_info("Aggregate ch episodes function started at {Sys.time()}") + + # Convert to data.table + data.table::setDT(episode_file) + + # Perform grouping and aggregation + episode_file <- episode_file[, `:=`( + ch_no_cost = max(ch_no_cost), + ch_ep_start = min(record_keydate1), + ch_ep_end = max(ch_ep_end), + ch_cost_per_day = mean(ch_cost_per_day) + ), by = c("chi", "ch_chi_cis")] + + # Convert back to tibble if needed + episode_file <- tibble::as_tibble(episode_file) + + return(episode_file) +} diff --git a/R/create_individual_file.R b/R/create_individual_file.R new file mode 100644 index 000000000..675e2066a --- /dev/null +++ b/R/create_individual_file.R @@ -0,0 +1,857 @@ +#' Create individual file +#' +#' @description Creates individual file from episode file +#' +#' @param episode_file Tibble containing episodic data +#' @param anon_chi_in (Default:TRUE) Is `anon_chi` used in the input +#' (instead of chi) 
+#' @inheritParams run_episode_file +#' +#' @return The processed individual file +#' @export +create_individual_file <- function( + episode_file, + year, + write_to_disk = TRUE, + anon_chi_in = TRUE, + anon_chi_out = TRUE) { + if (anon_chi_in) { + episode_file <- slfhelper::get_chi( + episode_file, + anon_chi_var = "anon_chi", + drop = TRUE + ) %>% + dplyr::mutate(chi = dplyr::na_if(.data$chi, "")) + } + + individual_file <- episode_file %>% + dplyr::select( + "year", + "chi", + "dob", + "gender", + "record_keydate1", + "record_keydate2", + "keytime1", + "keytime2", + "recid", + "smrtype", + "ipdc", + "postcode", + "gpprac", + "cij_marker", + "cij_start_date", + "cij_end_date", + "cij_pattype", + "cij_pattype_code", + "cij_ppa", + "ch_chi_cis", + "yearstay", + "cost_total_net", + "cost_total_net_inc_dnas", + "attendance_status", + "no_paid_items", + "total_no_dn_contacts", + "primary_delay_reason", + "sc_latest_submission", + "hc_hours_annual", + "hc_reablement", + "ooh_case_id" + ) %>% + remove_blank_chi() %>% + add_cij_columns() %>% + add_all_columns() %>% + aggregate_ch_episodes_zihao() %>% + clean_up_ch(year) %>% + recode_gender() %>% + aggregate_by_chi_zihao() %>% + clean_individual_file(year) %>% + join_cohort_lookups(year) %>% + match_on_ltcs(year) %>% + join_deaths_data(year) %>% + join_sparra_hhg(year) %>% + join_slf_lookup_vars() %>% + join_sc_client(year) %>% + dplyr::mutate(year = year) + + if (anon_chi_out) { + individual_file <- individual_file %>% + tidyr::replace_na(list(chi = "")) %>% + slfhelper::get_anon_chi() %>% + dplyr::mutate(anon_chi = dplyr::na_if(.data$anon_chi, "")) + } + + if (write_to_disk) { + slf_indiv_path <- get_file_path( + get_year_dir(year), + stringr::str_glue( + "source-individual-file-{year}.parquet" + ), + check_mode = "write" + ) + + write_file(individual_file, slf_indiv_path) + } + + return(individual_file) +} + +#' Remove blank CHI +#' +#' @description Convert blank strings to NA and remove NAs from CHI column +#' +#' 
@inheritParams create_individual_file +remove_blank_chi <- function(episode_file) { + cli::cli_alert_info("Remove blank CHI function started at {Sys.time()}") + + episode_file %>% + dplyr::mutate(chi = dplyr::na_if(.data$chi, "")) %>% + dplyr::filter(!is.na(.data$chi)) +} + + +#' Add CIJ-related columns +#' +#' @description Add new columns related to CIJ +#' +#' @inheritParams create_individual_file +add_cij_columns <- function(episode_file) { + cli::cli_alert_info("Add cij columns function started at {Sys.time()}") + + episode_file %>% + dplyr::mutate( + cij_non_el = dplyr::if_else( + .data$cij_pattype_code == 0, + .data$cij_marker, + NA_real_ + ), + cij_el = dplyr::if_else( + .data$cij_pattype_code == 1, + .data$cij_marker, + NA_real_ + ), + cij_mat = dplyr::if_else( + .data$cij_pattype_code == 2, + .data$cij_marker, + NA_real_ + ), + cij_delay = dplyr::if_else( + .data$recid == "DD", + .data$cij_marker, + NA_real_ + ), + preventable_admissions = dplyr::if_else( + .data$cij_ppa == 1, + .data$cij_marker, + NA_integer_ + ) + ) +} + +#' Add all columns +#' +#' @description Add new columns based on SMRType and recid which follow a pattern +#' of prefixed column names created based on some condition. 
+#' +#' @inheritParams create_individual_file +add_all_columns <- function(episode_file) { + cli::cli_alert_info("Add all columns function started at {Sys.time()}") + + episode_file %>% + add_acute_columns("Acute", (.data$smrtype == "Acute-DC" | .data$smrtype == "Acute-IP") & .data$cij_pattype != "Maternity") %>% + add_mat_columns("Mat", .data$recid == "02B" | .data$cij_pattype == "Maternity") %>% + add_mh_columns("MH", .data$recid == "04B" & .data$cij_pattype != "Maternity") %>% + add_gls_columns("GLS", .data$smrtype == "GLS-IP") %>% + add_op_columns("OP", .data$recid == "00B") %>% + add_ae_columns("AE", .data$recid == "AE2") %>% + add_pis_columns("PIS", .data$recid == "PIS") %>% + add_ooh_columns("OoH", .data$recid == "OoH") %>% + add_dn_columns("DN", .data$recid == "DN") %>% + add_cmh_columns("CMH", .data$recid == "CMH") %>% + add_dd_columns("DD", .data$recid == "DD") %>% + add_nsu_columns("NSU", .data$recid == "NSU") %>% + add_nrs_columns("NRS", .data$recid == "NRS") %>% + add_hl1_columns("HL1", .data$recid == "HL1") %>% + add_ch_columns("CH", .data$recid == "CH") %>% + add_hc_columns("HC", .data$recid == "HC") %>% + add_at_columns("AT", .data$recid == "AT") %>% + add_sds_columns("SDS", .data$recid == "SDS") %>% + dplyr::mutate( + health_net_cost = rowSums( + dplyr::pick( + .data$Acute_cost, + .data$Mat_cost, + .data$MH_cost, + .data$GLS_cost, + .data$OP_cost_attend, + .data$AE_cost, + .data$PIS_cost, + .data$OoH_cost + ), + na.rm = TRUE + ), + health_net_cost_inc_dnas = .data$health_net_cost + dplyr::if_else( + is.na(.data$OP_cost_dnas), + 0, + .data$OP_cost_dnas + ) + ) +} + +#' Add Acute columns +#' +#' @inheritParams create_individual_file +#' @param prefix Prefix to add to related columns, e.g. 
"Acute" +#' @param condition Condition to create new columns based on +add_acute_columns <- function(episode_file, prefix, condition) { + condition <- substitute(condition) + episode_file %>% + add_standard_cols(prefix, condition, episode = TRUE, cost = TRUE) %>% + add_ipdc_cols(prefix, condition) +} + +#' Add Mat columns +#' +#' @inheritParams add_acute_columns +add_mat_columns <- function(episode_file, prefix, condition) { + condition <- substitute(condition) + episode_file %>% + add_standard_cols(prefix, condition, episode = TRUE, cost = TRUE) %>% + add_ipdc_cols(prefix, condition, elective = FALSE) +} + +#' Add MH columns +#' +#' @inheritParams add_acute_columns +add_mh_columns <- function(episode_file, prefix, condition) { + condition <- substitute(condition) + episode_file %>% + add_standard_cols(prefix, condition, episode = TRUE, cost = TRUE) %>% + add_ipdc_cols(prefix, condition, ipdc_d = FALSE) +} + +#' Add GLS columns +#' +#' @inheritParams add_acute_columns +add_gls_columns <- function(episode_file, prefix, condition) { + condition <- substitute(condition) + episode_file %>% + add_standard_cols(prefix, condition, episode = TRUE, cost = TRUE) %>% + add_ipdc_cols(prefix, condition, ipdc_d = FALSE) +} + +#' Add OP columns +#' +#' @inheritParams add_acute_columns +add_op_columns <- function(episode_file, prefix, condition) { + condition <- substitute(condition) + episode_file <- episode_file %>% + add_standard_cols(prefix, condition) + condition_1 <- substitute(condition & attendance_status == 1) + episode_file <- episode_file %>% + dplyr::mutate( + "{prefix}_newcons_attendances" := dplyr::if_else(eval(condition_1), 1L, NA_integer_), + "{prefix}_cost_attend" := dplyr::if_else(eval(condition_1), .data$cost_total_net, NA_real_) + ) + condition_5_8 <- substitute(condition & attendance_status %in% c(5, 8)) + episode_file <- episode_file %>% + dplyr::mutate( + "{prefix}_newcons_dnas" := dplyr::if_else(eval(condition_5_8), 1L, NA_integer_), + "{prefix}_cost_dnas" 
:= dplyr::if_else(eval(condition_5_8), .data$cost_total_net_inc_dnas, NA_real_) + ) + return(episode_file) +} + +#' Add AE columns +#' +#' @inheritParams add_acute_columns +add_ae_columns <- function(episode_file, prefix, condition) { + condition <- substitute(condition) + episode_file %>% + add_standard_cols(prefix, condition, cost = TRUE) %>% + dplyr::mutate("{prefix}_attendances" := dplyr::if_else(eval(condition), 1L, NA_integer_)) +} + +#' Add PIS columns +#' +#' @inheritParams add_acute_columns +add_pis_columns <- function(episode_file, prefix, condition) { + condition <- substitute(condition) + episode_file %>% + add_standard_cols(prefix, condition, cost = TRUE) %>% + dplyr::mutate("{prefix}_paid_items" := dplyr::if_else(eval(condition), .data$no_paid_items, NA_integer_)) +} + +#' Add OoH columns +#' +#' @inheritParams add_acute_columns +add_ooh_columns <- function(episode_file, prefix, condition) { + condition <- substitute(condition) + episode_file <- episode_file %>% + add_standard_cols(prefix, condition, cost = TRUE) %>% + dplyr::mutate( + "{prefix}_homeV" := dplyr::if_else(eval(condition) & .data$smrtype == "OOH-HomeV", 1L, NA_integer_), + "{prefix}_advice" := dplyr::if_else(eval(condition) & .data$smrtype == "OOH-Advice", 1L, NA_integer_), + "{prefix}_DN" := dplyr::if_else(eval(condition) & .data$smrtype == "OOH-DN", 1L, NA_integer_), + "{prefix}_NHS24" := dplyr::if_else(eval(condition) & .data$smrtype == "OOH-NHS24", 1L, NA_integer_), + "{prefix}_other" := dplyr::if_else(eval(condition) & .data$smrtype == "OOH-Other", 1L, NA_integer_), + "{prefix}_PCC" := dplyr::if_else(eval(condition) & .data$smrtype == "OOH-PCC", 1L, NA_integer_), + "{prefix}_covid_advice" := dplyr::if_else(eval(condition) & .data$smrtype == "OOH-C19Adv", 1L, NA_integer_), + "{prefix}_covid_assessment" := dplyr::if_else(eval(condition) & .data$smrtype == "OOH-C19Ass", 1L, NA_integer_), + "{prefix}_covid_other" := dplyr::if_else(eval(condition) & .data$smrtype == "OOH-C190th", 1L, 
NA_integer_) + ) + + episode_file <- episode_file %>% + dplyr::mutate( + "{prefix}_consultation_time" := dplyr::if_else( + eval(condition), + pmax( + 0, + as.numeric((lubridate::seconds_to_period(.data$keytime2) + .data$record_keydate2) - (lubridate::seconds_to_period(.data$keytime1) + .data$record_keydate1), units = "mins") + ), + NA_real_ + ), + ) + + return(episode_file) +} + +#' Add DN columns +#' +#' @inheritParams add_acute_columns +add_dn_columns <- function(episode_file, prefix, condition) { + condition <- substitute(condition) + episode_file %>% + add_standard_cols(prefix, condition, episode = TRUE, cost = TRUE) %>% + dplyr::mutate("{prefix}_contacts" := dplyr::if_else(eval(condition), .data$total_no_dn_contacts, NA_integer_)) +} + +#' Add CMH columns +#' +#' @inheritParams add_acute_columns +add_cmh_columns <- function(episode_file, prefix, condition) { + condition <- substitute(condition) + episode_file %>% + add_standard_cols(prefix, condition) %>% + dplyr::mutate("{prefix}_contacts" := dplyr::if_else(eval(condition), 1L, NA_integer_)) +} + +#' Add DD columns +#' +#' @inheritParams add_acute_columns +add_dd_columns <- function(episode_file, prefix, condition) { + condition <- substitute(condition) + condition_delay <- substitute(condition & primary_delay_reason != "9") + episode_file <- episode_file %>% + dplyr::mutate( + "{prefix}_NonCode9_episodes" := dplyr::if_else(eval(condition_delay), 1L, NA_integer_), + "{prefix}_NonCode9_beddays" := dplyr::if_else(eval(condition_delay), .data$yearstay, NA_real_) + ) + condition_delay_9 <- substitute(condition & primary_delay_reason == "9") + episode_file <- episode_file %>% + dplyr::mutate( + "{prefix}_Code9_episodes" := dplyr::if_else(eval(condition_delay_9), 1L, NA_integer_), + "{prefix}_Code9_beddays" := dplyr::if_else(eval(condition_delay_9), .data$yearstay, NA_real_) + ) + return(episode_file) +} + +#' Add NSU columns +#' +#' @inheritParams add_acute_columns +add_nsu_columns <- function(episode_file, 
prefix, condition) {
+  condition <- substitute(condition)
+  episode_file %>%
+    add_standard_cols(prefix, condition) %>%
+    dplyr::mutate("{prefix}" := dplyr::if_else(eval(condition), 1L, NA_integer_))
+}
+
+#' Add NRS columns
+#'
+#' @inheritParams add_acute_columns
+add_nrs_columns <- function(episode_file, prefix, condition) {
+  # Capture the condition unevaluated so it can be evaluated inside `mutate()`
+  condition <- substitute(condition)
+  episode_file %>%
+    # With its default flags `add_standard_cols()` adds no columns here
+    add_standard_cols(prefix, condition) %>%
+    # Flag column named after the prefix: 1 where the condition holds, else NA
+    dplyr::mutate("{prefix}" := dplyr::if_else(eval(condition), 1L, NA_integer_))
+}
+
+#' Add HL1 columns
+#'
+#' @inheritParams add_acute_columns
+add_hl1_columns <- function(episode_file, prefix, condition) {
+  condition <- substitute(condition)
+  # Only the standard columns are requested and both flags default to FALSE,
+  # so the data is passed through without new columns
+  episode_file %>%
+    add_standard_cols(prefix, condition)
+}
+
+#' Add CH columns
+#'
+#' @inheritParams add_acute_columns
+add_ch_columns <- function(episode_file, prefix, condition) {
+  condition <- substitute(condition)
+  episode_file %>%
+    add_standard_cols(prefix, condition) %>%
+    dplyr::mutate(
+      # Daily cost when there is a positive stay; otherwise the total cost
+      # is carried through unchanged (this includes rows failing the condition)
+      ch_cost_per_day = dplyr::if_else(
+        eval(condition) & .data$yearstay > 0,
+        .data$cost_total_net / .data$yearstay,
+        .data$cost_total_net
+      ),
+      # TRUE for matching rows where no daily cost could be derived
+      ch_no_cost = eval(condition) & is.na(.data$ch_cost_per_day),
+      ch_ep_end = dplyr::if_else(
+        eval(condition),
+        .data$record_keydate2,
+        lubridate::NA_Date_
+      ),
+      # If end date is missing use the first day of next FY quarter
+      ch_ep_end = dplyr::if_else(
+        eval(condition) & is.na(.data$ch_ep_end),
+        start_next_fy_quarter(.data$sc_latest_submission),
+        .data$ch_ep_end
+      )
+    )
+}
+
+#' Add HC columns
+#'
+#' @inheritParams add_acute_columns
+add_hc_columns <- function(episode_file, prefix, condition) {
+  condition <- substitute(condition)
+  # Totals across every home care record matching the condition
+  episode_file <- episode_file %>%
+    add_standard_cols(prefix, condition, episode = TRUE) %>%
+    dplyr::mutate(
+      "{prefix}_total_hours" := dplyr::if_else(eval(condition), .data$hc_hours_annual, NA_real_),
+      "{prefix}_total_cost" := dplyr::if_else(eval(condition), .data$cost_total_net, NA_real_)
+    )
+  # Personal care subset
+  condition_per <- substitute(condition & smrtype == "HC-Per")
+  episode_file <- episode_file %>%
+    dplyr::mutate(
+      "{prefix}_personal_episodes" := dplyr::if_else(eval(condition_per), 1L, NA_integer_),
+      # Read the source column rather than the hard-coded `HC_total_hours`,
+      # which only exists when `prefix` is "HC". The values are the same for
+      # rows matching the condition, and this matches the blocks below.
+      "{prefix}_personal_hours" := dplyr::if_else(eval(condition_per), .data$hc_hours_annual, NA_real_),
+      "{prefix}_personal_hours_cost" := dplyr::if_else(eval(condition_per), .data$cost_total_net, NA_real_)
+    )
+  # Non-personal care subset
+  condition_non_per <- substitute(condition & smrtype == "HC-Non-Per")
+  episode_file <- episode_file %>%
+    dplyr::mutate(
+      "{prefix}_non_personal_episodes" := dplyr::if_else(eval(condition_non_per), 1L, NA_integer_),
+      "{prefix}_non_personal_hours" := dplyr::if_else(eval(condition_non_per), .data$hc_hours_annual, NA_real_),
+      "{prefix}_non_personal_hours_cost" := dplyr::if_else(eval(condition_non_per), .data$cost_total_net, NA_real_)
+    )
+  # Reablement subset
+  condition_reabl <- substitute(condition & hc_reablement == 1)
+  episode_file <- episode_file %>%
+    dplyr::mutate(
+      "{prefix}_reablement_episodes" := dplyr::if_else(eval(condition_reabl), 1L, NA_integer_),
+      "{prefix}_reablement_hours" := dplyr::if_else(eval(condition_reabl), .data$hc_hours_annual, NA_real_),
+      "{prefix}_reablement_hours_cost" := dplyr::if_else(eval(condition_reabl), .data$cost_total_net, NA_real_)
+    )
+  return(episode_file)
+}
+
+#' Add AT columns
+#'
+#' @inheritParams add_acute_columns
+add_at_columns <- function(episode_file, prefix, condition) {
+  condition <- substitute(condition)
+  episode_file %>%
+    add_standard_cols(prefix, condition) %>%
+    # One flag column per `smrtype` value handled here
+    dplyr::mutate(
+      "{prefix}_alarms" := dplyr::if_else(eval(condition) & .data$smrtype == "AT-Alarm", 1L, NA_integer_),
+      "{prefix}_telecare" := dplyr::if_else(eval(condition) & .data$smrtype == "AT-Tele", 1L, NA_integer_)
+    )
+}
+
+#' Add SDS columns
+#'
+#' @inheritParams add_acute_columns
+add_sds_columns <- function(episode_file, prefix, condition) {
+  condition <- substitute(condition)
+  episode_file %>%
+    add_standard_cols(prefix, condition) %>%
+    # One flag column per SDS option, keyed on `smrtype`
+    dplyr::mutate(
+      "{prefix}_option_1" := dplyr::if_else(eval(condition) & .data$smrtype == "SDS-1", 1L, NA_integer_),
+      "{prefix}_option_2" := dplyr::if_else(eval(condition) & .data$smrtype == "SDS-2", 1L, NA_integer_),
+      "{prefix}_option_3" := dplyr::if_else(eval(condition) & .data$smrtype == "SDS-3", 1L, NA_integer_),
+      "{prefix}_option_4" := dplyr::if_else(eval(condition) & .data$smrtype == "SDS-4", 1L, NA_integer_)
+    )
+}
+
+#' Add columns based on IPDC
+#'
+#' @description Add columns based on value in IPDC column, which can
+#' be further split by Elective/Non-Elective CIJ.
+#'
+#' @inheritParams add_acute_columns
+#' @param ipdc_d Whether to create columns based on IPDC = "D" (lgl)
+#' @param elective Whether to create columns based on Elective/Non-Elective cij_pattype (lgl)
+add_ipdc_cols <- function(episode_file, prefix, condition, ipdc_d = TRUE, elective = TRUE) {
+  # Inpatient columns are always created
+  condition_i <- substitute(eval(condition) & ipdc == "I")
+  episode_file <- episode_file %>%
+    dplyr::mutate(
+      "{prefix}_inpatient_cost" := dplyr::if_else(eval(condition_i), .data$cost_total_net, NA_real_),
+      "{prefix}_inpatient_episodes" := dplyr::if_else(eval(condition_i), 1L, NA_integer_),
+      "{prefix}_inpatient_beddays" := dplyr::if_else(eval(condition_i), .data$yearstay, NA_real_)
+    )
+  if (elective) {
+    # Both splits build on the inpatient condition above
+    condition_el <- substitute(condition_i & cij_pattype == "Elective")
+    episode_file <- episode_file %>%
+      dplyr::mutate(
+        "{prefix}_el_inpatient_episodes" := dplyr::if_else(eval(condition_el), 1L, NA_integer_),
+        "{prefix}_el_inpatient_beddays" := dplyr::if_else(eval(condition_el), .data$yearstay, NA_real_),
+        "{prefix}_el_inpatient_cost" := dplyr::if_else(eval(condition_el), .data$cost_total_net, NA_real_)
+      )
+    condition_non_el <- substitute(condition_i & cij_pattype == "Non-Elective")
+    episode_file <- episode_file %>%
+      dplyr::mutate(
+        "{prefix}_non_el_inpatient_episodes" := dplyr::if_else(eval(condition_non_el), 1L, NA_integer_),
+        "{prefix}_non_el_inpatient_beddays" := dplyr::if_else(eval(condition_non_el), .data$yearstay, NA_real_),
+        "{prefix}_non_el_inpatient_cost" := dplyr::if_else(eval(condition_non_el), .data$cost_total_net, NA_real_)
+      )
+  }
+  if (ipdc_d) {
+    # Daycase columns (no beddays column is created for daycases)
+    condition_d <- substitute(eval(condition) & ipdc == "D")
+    episode_file <- episode_file %>%
+      dplyr::mutate(
+        "{prefix}_daycase_episodes" := dplyr::if_else(eval(condition_d), 1L, NA_integer_),
+        "{prefix}_daycase_cost" := dplyr::if_else(eval(condition_d), .data$cost_total_net, NA_real_)
+      )
+  }
+  return(episode_file)
+}
+
+#' Add standard columns
+#'
+#' @description Add standard columns (DoB, postcode, gpprac, episodes, cost) to episode file.
+#'
+#' @inheritParams add_acute_columns
+#' @param episode Whether to create prefix_episodes col, e.g. "Acute_episodes"
+#' @param cost Whether to create prefix_cost col, e.g. "Acute_cost"
+add_standard_cols <- function(episode_file, prefix, condition, episode = FALSE, cost = FALSE) {
+  # NOTE(review): only the episodes and cost columns are created here; the
+  # DoB, postcode and gpprac columns named in the description are not added
+  # by this function.
+  # `condition` is expected to already be an unevaluated expression, as
+  # passed by the `add_*_columns()` callers in this file.
+  if (episode) {
+    episode_file <- dplyr::mutate(episode_file, "{prefix}_episodes" := dplyr::if_else(eval(condition), 1L, NA_integer_))
+  }
+  if (cost) {
+    episode_file <- dplyr::mutate(episode_file, "{prefix}_cost" := dplyr::if_else(eval(condition), .data$cost_total_net, NA_real_))
+  }
+  return(episode_file)
+}
+
+
+#' Aggregate CIS episodes
+#'
+#' @description Aggregate CH variables by CHI and CIS.
+#'
+#' @inheritParams create_individual_file
+aggregate_ch_episodes <- function(episode_file) {
+  cli::cli_alert_info("Aggregate ch episodes function started at {Sys.time()}")
+
+  # The data is converted to a data.table before grouping (done for speed by
+  # the original author) and grouped by person and care home stay.
+  # A filter on non-missing `ch_chi_cis` was previously considered here.
+  grouped_episodes <- dplyr::group_by(
+    data.table::as.data.table(episode_file),
+    .data$chi,
+    .data$ch_chi_cis
+  )
+
+  # Group-level values are written back onto every row of the group
+  with_stay_summaries <- dplyr::mutate(
+    grouped_episodes,
+    ch_no_cost = max(.data$ch_no_cost),
+    ch_ep_start = min(.data$record_keydate1),
+    ch_ep_end = max(.data$ch_ep_end),
+    ch_cost_per_day = mean(.data$ch_cost_per_day)
+  )
+
+  # Drop the grouping and hand back a tibble rather than a data.table.
+  # An alternative distinct/select/right_join formulation was considered
+  # and left out.
+  tibble::as_tibble(dplyr::ungroup(with_stay_summaries))
+}
+
+#' Clean up CH
+#'
+#' @description Clean up CH-related columns.
+#'
+#' @inheritParams create_individual_file
+clean_up_ch <- function(episode_file, year) {
+  cli::cli_alert_info("Clean up CH function started at {Sys.time()}")
+
+  # `mutate()` evaluates its arguments in order, so the helper columns and the
+  # columns derived from them can share a single call.
+  with_ch_values <- dplyr::mutate(
+    episode_file,
+    # Temporary helper columns, removed again below
+    fy_end = end_fy(year),
+    fy_start = start_fy(year),
+    term_1 = pmin(.data$ch_ep_end, .data$fy_end + 1),
+    term_2 = pmax(.data$ch_ep_start, .data$fy_start),
+    # Days of the stay falling inside the financial year, CH records only
+    ch_beddays = dplyr::if_else(
+      .data$recid == "CH",
+      as.numeric(.data$term_1 - .data$term_2),
+      NA_real_
+    ),
+    # Cost is only derived where a daily cost is available
+    ch_cost = dplyr::if_else(
+      .data$recid == "CH" & .data$ch_no_cost == 0,
+      .data$ch_beddays * .data$ch_cost_per_day,
+      NA_real_
+    ),
+    # CH records with a zero stay marker contribute nothing
+    ch_beddays = dplyr::if_else(
+      .data$recid == "CH" & .data$ch_chi_cis == 0,
+      0,
+      .data$ch_beddays
+    ),
+    ch_cost = dplyr::if_else(
+      .data$recid == "CH" & .data$ch_chi_cis == 0,
+      0,
+      .data$ch_cost
+    )
+  )
+
+  dplyr::select(with_ch_values, !c("fy_end", "fy_start", "term_1", "term_2"))
+}
+
+#' Recode gender
+#'
+#' @description Recode gender to 1.5 if 0 or 9.
+#'
+#' @inheritParams create_individual_file
+recode_gender <- function(episode_file) {
+  cli::cli_alert_info("Recode Gender function started at {Sys.time()}")
+
+  # Codes 0 and 9 become 1.5; every other value is kept as it is
+  dplyr::mutate(
+    episode_file,
+    gender = dplyr::if_else(.data$gender %in% c(0, 9), 1.5, .data$gender)
+  )
+}
+
+#' Aggregate by CHI
+#'
+#' @description Aggregate episode file by CHI to convert into
+#' individual file.
+#'
+#' @inheritParams create_individual_file
+aggregate_by_chi <- function(episode_file) {
+  cli::cli_alert_info("Aggregate by CHI function started at {Sys.time()}")
+
+  episode_file %>%
+    # Order records within each person so that `first()` / `last()` below pick
+    # the earliest / latest value. The `.data` pronoun is used, as elsewhere
+    # in the package, to avoid "no visible binding" notes in R CMD check.
+    dplyr::arrange(
+      .data$chi,
+      .data$record_keydate1,
+      .data$keytime1,
+      .data$record_keydate2,
+      .data$keytime2
+    ) %>%
+    dplyr::group_by(.data$chi) %>%
+    dplyr::summarise(
+      gender = mean(.data$gender),
+      # Latest non-missing value of the demographic columns
+      dplyr::across(
+        dplyr::ends_with(c("postcode", "DoB", "gpprac")),
+        ~ dplyr::last(., na_rm = TRUE)
+      ),
+      # Counts of distinct identifiers (renamed on the way out)
+      dplyr::across(
+        c(
+          "ch_cis_episodes" = "ch_chi_cis",
+          "cij_total" = "cij_marker",
+          "cij_el",
+          "cij_non_el",
+          "cij_mat",
+          # "cij_delay",
+          "ooh_cases" = "ooh_case_id",
+          "preventable_admissions"
+        ),
+        ~ dplyr::n_distinct(.x, na.rm = TRUE)
+      ),
+      # Totals of the activity / cost columns, matched by name suffix
+      dplyr::across(
+        c(
+          dplyr::ends_with(
+            c(
+              "episodes",
+              "beddays",
+              "cost",
+              "attendances",
+              "attend",
+              "contacts",
+              "hours",
+              "alarms",
+              "telecare",
+              "paid_items",
+              "advice",
+              "homeV",
+              "time",
+              "assessment",
+              "other",
+              # "DN",
+              "NHS24",
+              "PCC",
+              "_dnas"
+            )
+          ),
+          dplyr::starts_with("SDS_option")
+        ),
+        ~ sum(., na.rm = TRUE)
+      ),
+      # dplyr::across(
+      #   c(
+      #     # dplyr::starts_with("sc_"),
+      #     #-"sc_send_lca",
+      #     #-"sc_latest_submission",
+      #     # "HL1_in_FY" = "hh_in_fy",
+      #     "NSU"
+      #   ),
+      #   ~ max_no_inf(.)
+      # ),
+      # First non-missing value of the per-person columns
+      # (the dangling trailing comma in this selection has been removed)
+      dplyr::across(
+        c(
+          condition_cols(),
+          # "death_date",
+          # "deceased",
+          "year",
+          dplyr::ends_with(c(
+            "_Cohort", "end_fy", "start_fy"
+          ))
+        ),
+        ~ dplyr::first(., na_rm = TRUE)
+      )
+    ) %>%
+    dplyr::ungroup()
+}
+
+#' Condition columns
+#'
+#' @description Returns chr vector of column names
+#' which follow format "condition" and "condition_date" e.g.
+#' "dementia" and "dementia_date"
+condition_cols <- function() {
+  # Each LTC variable name is paired with its "<name>_date" column
+  conditions <- slfhelper::ltc_vars
+  date_cols <- paste0(conditions, "_date")
+  all_cols <- c(conditions, date_cols)
+  return(all_cols)
+}
+
+#' Custom maximum
+#'
+#' @description Custom maximum function which removes
+#' missing values but doesn't return Inf if all values
+#' are missing (instead returns NA)
+#'
+#' @param x Vector to return max of
+max_no_inf <- function(x) {
+  # A plain `if` is used because `dplyr::if_else()` evaluates both branches:
+  # `max()` was still called on all-missing input, raising the
+  # "no non-missing arguments" warning this helper is meant to avoid.
+  if (all(is.na(x))) {
+    return(NA)
+  }
+  max(x, na.rm = TRUE)
+}
+
+#' Custom minimum
+#'
+#' @description Custom minimum function which removes
+#' missing values but doesn't return Inf if all values
+#' are missing (instead returns NA)
+#'
+#' @param x Vector to return min of
+min_no_inf <- function(x) {
+  # See `max_no_inf()`: short-circuit so `min()` never sees all-missing input
+  if (all(is.na(x))) {
+    return(NA)
+  }
+  min(x, na.rm = TRUE)
+}
+
+#' Clean individual file
+#'
+#' @description Clean up columns in individual file
+#'
+#' @param individual_file Individual file where each row represents a unique CHI
+#' @param year Financial year e.g 1718
+clean_individual_file <- function(individual_file, year) {
+  cli::cli_alert_info("Clean individual file function started at {Sys.time()}")
+
+  individual_file %>%
+    # Drop intermediate columns that should not appear in the output
+    dplyr::select(
+      !c(
+        "ch_no_cost",
+        "no_paid_items",
+        "total_no_dn_contacts",
+        "cost_total_net_inc_dnas"
+      )
+    ) %>%
+    clean_up_gender() %>%
+    dplyr::mutate(age = compute_mid_year_age(year, .data$dob))
+}
+
+#' Clean up gender column
+#'
+#' @description Clean up column containing gender.
+#'
+#' @inheritParams clean_individual_file
+clean_up_gender <- function(individual_file) {
+  individual_file %>%
+    dplyr::mutate(
+      # Anything other than exactly 1.5 is rounded to a whole code; a value of
+      # 1.5 (or a missing value) falls through to the CHI-derived sex
+      gender = dplyr::case_when(
+        .data$gender != 1.5 ~ round(.data$gender),
+        .default = phsmethods::sex_from_chi(.data$chi, chi_check = FALSE)
+      )
+    )
+}
+
+#' Join slf lookup variables
+#'
+#' @description Join lookup variables from slf postcode lookup and slf gpprac
+#' lookup.
+#'
+#' @param individual_file the processed individual file.
+#' @param slf_postcode_lookup SLF processed postcode lookup
+#' @param slf_gpprac_lookup SLF processed gpprac lookup
+#' @param hbrescode_var hbrescode variable
+#'
+join_slf_lookup_vars <- function(individual_file,
+                                 slf_postcode_lookup = read_file(get_slf_postcode_path()),
+                                 slf_gpprac_lookup = read_file(
+                                   get_slf_gpprac_path(),
+                                   col_select = c("gpprac", "cluster", "hbpraccode")
+                                 ),
+                                 hbrescode_var = "hb2018") {
+  individual_file <- individual_file %>%
+    dplyr::left_join(
+      slf_postcode_lookup,
+      by = "postcode"
+    ) %>%
+    dplyr::left_join(
+      slf_gpprac_lookup,
+      by = "gpprac"
+    ) %>%
+    # `hbrescode_var` holds a column name as a string. `all_of()` states that
+    # explicitly, so it cannot be mistaken for a column literally called
+    # `hbrescode_var`, and it avoids the tidyselect deprecation warning for
+    # bare external vectors.
+    dplyr::rename(hbrescode = dplyr::all_of(hbrescode_var))
+
+  return(individual_file)
+}
+# TODO Remove the client data from the individual Social Care extracts
+# and instead, use this function in the episode file to match on the client
+# data to all episodes.
+#' Join sc client variables onto individual file
+#'
+#' @description Match on sc client variables.
+#'
+#' @param individual_file the processed individual file
+#' @param year financial year.
+#' @param sc_client SC client lookup
+#' @param sc_demographics SC Demographic lookup
+join_sc_client <- function(individual_file,
+                           year,
+                           sc_client = read_file(get_source_extract_path(year, "Client")),
+                           sc_demographics = read_file(get_sc_demog_lookup_path(),
+                             col_select = c("sending_location", "social_care_id", "chi")
+                           )) {
+  # TODO Update the client lookup processing script to match
+  # on demographics there so the client lookup already has CHI.
+ + # Match to demographics lookup to get CHI + join_client_demog <- sc_client %>% + dplyr::left_join( + sc_demographics %>% + dplyr::select("sending_location", "social_care_id", "chi"), + by = c("sending_location", "social_care_id") + ) + + # Match on client variables by chi + individual_file <- individual_file %>% + dplyr::left_join( + join_client_demog, + by = "chi" + ) %>% + dplyr::select(!c("sending_location", "social_care_id", "sc_latest_submission")) + + return(individual_file) +} diff --git a/R/process_tests_individual_file.R b/R/process_tests_individual_file.R index 2eb3503e2..695dc19a0 100644 --- a/R/process_tests_individual_file.R +++ b/R/process_tests_individual_file.R @@ -10,12 +10,12 @@ process_tests_individual_file <- function(data, year) { data <- data %>% dplyr::select( "year", - "chi", + "anon_chi", "gender", - # "postcode", # Add back in once postcode is fixed + "postcode", "dob", - # "hbrescode", #add back in when available - # "health_net_cost", + "hbrescode", + "health_net_cost", slfhelper::ltc_vars, dplyr::contains(c( "beddays", @@ -26,7 +26,8 @@ process_tests_individual_file <- function(data, year) { "cases", "consultations" )) - ) + ) %>% + slfhelper::get_chi() old_data <- get_existing_data_for_tests(data, file_version = "individual") @@ -61,8 +62,8 @@ produce_individual_file_tests <- function(data) { test_flags <- data %>% # use functions to create HB and partnership flags create_demog_test_flags() %>% - # create_hb_test_flags(.data$hbrescode) %>% - # create_hb_cost_test_flags(.data$hbrescode, .data$health_net_cost) %>% + create_hb_test_flags(.data$hbrescode) %>% + create_hb_cost_test_flags(.data$hbrescode, .data$health_net_cost) %>% # keep variables for comparison dplyr::select(c("valid_chi":dplyr::last_col())) %>% # use function to sum new test flags @@ -82,13 +83,13 @@ produce_individual_file_tests <- function(data) { measure = "all" ) - # min_max_measures <- data %>% - # calculate_measures( - # vars = c( - # "health_net_cost", - # ), - 
# measure = "min-max" - # ) + min_max_measures <- data %>% + calculate_measures( + vars = c( + "health_net_cost" + ), + measure = "min-max" + ) sum_measures <- data %>% dplyr::select(slfhelper::ltc_vars) %>% @@ -102,7 +103,7 @@ produce_individual_file_tests <- function(data) { join_output <- list( test_flags, all_measures, - # min_max_measures, + min_max_measures, sum_measures ) %>% purrr::reduce(dplyr::full_join, by = c("measure", "value")) diff --git a/_targets.R b/_targets.R index 58e5f573f..ef2fbbe74 100644 --- a/_targets.R +++ b/_targets.R @@ -549,6 +549,20 @@ list( ) ), tar_target( + individual_file, + create_individual_file( + episode_file = episode_file, + year = year, + write_to_disk = write_to_disk + ) + ), + tar_target( + individual_file_tests, + process_tests_individual_file( + data = individual_file, + year = year +), + tar_target( episode_file_dataset, arrow::write_dataset( dataset = episode_file, diff --git a/man/add_acute_columns.Rd b/man/add_acute_columns.Rd new file mode 100644 index 000000000..52ba071b6 --- /dev/null +++ b/man/add_acute_columns.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/create_individual_file.R +\name{add_acute_columns} +\alias{add_acute_columns} +\title{Add Acute columns} +\usage{ +add_acute_columns(episode_file, prefix, condition) +} +\arguments{ +\item{episode_file}{Tibble containing episodic data} + +\item{prefix}{Prefix to add to related columns, e.g. 
"Acute"} + +\item{condition}{Condition to create new columns based on} +} +\description{ +Add Acute columns +} diff --git a/man/add_ae_columns.Rd b/man/add_ae_columns.Rd new file mode 100644 index 000000000..9b7099513 --- /dev/null +++ b/man/add_ae_columns.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/create_individual_file.R +\name{add_ae_columns} +\alias{add_ae_columns} +\title{Add AE columns} +\usage{ +add_ae_columns(episode_file, prefix, condition) +} +\arguments{ +\item{episode_file}{Tibble containing episodic data} + +\item{prefix}{Prefix to add to related columns, e.g. "Acute"} + +\item{condition}{Condition to create new columns based on} +} +\description{ +Add AE columns +} diff --git a/man/add_all_columns.Rd b/man/add_all_columns.Rd new file mode 100644 index 000000000..d502e95c3 --- /dev/null +++ b/man/add_all_columns.Rd @@ -0,0 +1,15 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/create_individual_file.R +\name{add_all_columns} +\alias{add_all_columns} +\title{Add all columns} +\usage{ +add_all_columns(episode_file) +} +\arguments{ +\item{episode_file}{Tibble containing episodic data} +} +\description{ +Add new columns based on SMRType and recid which follow a pattern +of prefixed column names created based on some condition. +} diff --git a/man/add_at_columns.Rd b/man/add_at_columns.Rd new file mode 100644 index 000000000..e05ea9101 --- /dev/null +++ b/man/add_at_columns.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/create_individual_file.R +\name{add_at_columns} +\alias{add_at_columns} +\title{Add AT columns} +\usage{ +add_at_columns(episode_file, prefix, condition) +} +\arguments{ +\item{episode_file}{Tibble containing episodic data} + +\item{prefix}{Prefix to add to related columns, e.g. 
"Acute"} + +\item{condition}{Condition to create new columns based on} +} +\description{ +Add AT columns +} diff --git a/man/add_ch_columns.Rd b/man/add_ch_columns.Rd new file mode 100644 index 000000000..4938f7690 --- /dev/null +++ b/man/add_ch_columns.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/create_individual_file.R +\name{add_ch_columns} +\alias{add_ch_columns} +\title{Add CH columns} +\usage{ +add_ch_columns(episode_file, prefix, condition) +} +\arguments{ +\item{episode_file}{Tibble containing episodic data} + +\item{prefix}{Prefix to add to related columns, e.g. "Acute"} + +\item{condition}{Condition to create new columns based on} +} +\description{ +Add CH columns +} diff --git a/man/add_cij_columns.Rd b/man/add_cij_columns.Rd new file mode 100644 index 000000000..7d00e6299 --- /dev/null +++ b/man/add_cij_columns.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/create_individual_file.R +\name{add_cij_columns} +\alias{add_cij_columns} +\title{Add CIJ-related columns} +\usage{ +add_cij_columns(episode_file) +} +\arguments{ +\item{episode_file}{Tibble containing episodic data} +} +\description{ +Add new columns related to CIJ +} diff --git a/man/add_cmh_columns.Rd b/man/add_cmh_columns.Rd new file mode 100644 index 000000000..a1d82cba6 --- /dev/null +++ b/man/add_cmh_columns.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/create_individual_file.R +\name{add_cmh_columns} +\alias{add_cmh_columns} +\title{Add CMH columns} +\usage{ +add_cmh_columns(episode_file, prefix, condition) +} +\arguments{ +\item{episode_file}{Tibble containing episodic data} + +\item{prefix}{Prefix to add to related columns, e.g. 
"Acute"} + +\item{condition}{Condition to create new columns based on} +} +\description{ +Add CMH columns +} diff --git a/man/add_dd_columns.Rd b/man/add_dd_columns.Rd new file mode 100644 index 000000000..08d9c0fe4 --- /dev/null +++ b/man/add_dd_columns.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/create_individual_file.R +\name{add_dd_columns} +\alias{add_dd_columns} +\title{Add DD columns} +\usage{ +add_dd_columns(episode_file, prefix, condition) +} +\arguments{ +\item{episode_file}{Tibble containing episodic data} + +\item{prefix}{Prefix to add to related columns, e.g. "Acute"} + +\item{condition}{Condition to create new columns based on} +} +\description{ +Add DD columns +} diff --git a/man/add_dn_columns.Rd b/man/add_dn_columns.Rd new file mode 100644 index 000000000..bf6af008f --- /dev/null +++ b/man/add_dn_columns.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/create_individual_file.R +\name{add_dn_columns} +\alias{add_dn_columns} +\title{Add DN columns} +\usage{ +add_dn_columns(episode_file, prefix, condition) +} +\arguments{ +\item{episode_file}{Tibble containing episodic data} + +\item{prefix}{Prefix to add to related columns, e.g. "Acute"} + +\item{condition}{Condition to create new columns based on} +} +\description{ +Add DN columns +} diff --git a/man/add_gls_columns.Rd b/man/add_gls_columns.Rd new file mode 100644 index 000000000..e71dc755b --- /dev/null +++ b/man/add_gls_columns.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/create_individual_file.R +\name{add_gls_columns} +\alias{add_gls_columns} +\title{Add GLS columns} +\usage{ +add_gls_columns(episode_file, prefix, condition) +} +\arguments{ +\item{episode_file}{Tibble containing episodic data} + +\item{prefix}{Prefix to add to related columns, e.g. 
"Acute"} + +\item{condition}{Condition to create new columns based on} +} +\description{ +Add GLS columns +} diff --git a/man/add_hc_columns.Rd b/man/add_hc_columns.Rd new file mode 100644 index 000000000..95d8f1d3b --- /dev/null +++ b/man/add_hc_columns.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/create_individual_file.R +\name{add_hc_columns} +\alias{add_hc_columns} +\title{Add HC columns} +\usage{ +add_hc_columns(episode_file, prefix, condition) +} +\arguments{ +\item{episode_file}{Tibble containing episodic data} + +\item{prefix}{Prefix to add to related columns, e.g. "Acute"} + +\item{condition}{Condition to create new columns based on} +} +\description{ +Add HC columns +} diff --git a/man/add_hl1_columns.Rd b/man/add_hl1_columns.Rd new file mode 100644 index 000000000..7600db5e9 --- /dev/null +++ b/man/add_hl1_columns.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/create_individual_file.R +\name{add_hl1_columns} +\alias{add_hl1_columns} +\title{Add HL1 columns} +\usage{ +add_hl1_columns(episode_file, prefix, condition) +} +\arguments{ +\item{episode_file}{Tibble containing episodic data} + +\item{prefix}{Prefix to add to related columns, e.g. "Acute"} + +\item{condition}{Condition to create new columns based on} +} +\description{ +Add HL1 columns +} diff --git a/man/add_ipdc_cols.Rd b/man/add_ipdc_cols.Rd new file mode 100644 index 000000000..0f91cbd90 --- /dev/null +++ b/man/add_ipdc_cols.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/create_individual_file.R +\name{add_ipdc_cols} +\alias{add_ipdc_cols} +\title{Add columns based on IPDC} +\usage{ +add_ipdc_cols(episode_file, prefix, condition, ipdc_d = TRUE, elective = TRUE) +} +\arguments{ +\item{episode_file}{Tibble containing episodic data} + +\item{prefix}{Prefix to add to related columns, e.g. 
"Acute"} + +\item{condition}{Condition to create new columns based on} + +\item{ipdc_d}{Whether to create columns based on IPDC = "D" (lgl)} + +\item{elective}{Whether to create columns based on Elective/Non-Elective cij_pattype (lgl)} +} +\description{ +Add columns based on value in IPDC column, which can +be further split by Elective/Non-Elective CIJ. +} diff --git a/man/add_mat_columns.Rd b/man/add_mat_columns.Rd new file mode 100644 index 000000000..aae729323 --- /dev/null +++ b/man/add_mat_columns.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/create_individual_file.R +\name{add_mat_columns} +\alias{add_mat_columns} +\title{Add Mat columns} +\usage{ +add_mat_columns(episode_file, prefix, condition) +} +\arguments{ +\item{episode_file}{Tibble containing episodic data} + +\item{prefix}{Prefix to add to related columns, e.g. "Acute"} + +\item{condition}{Condition to create new columns based on} +} +\description{ +Add Mat columns +} diff --git a/man/add_mh_columns.Rd b/man/add_mh_columns.Rd new file mode 100644 index 000000000..3c50c6cb8 --- /dev/null +++ b/man/add_mh_columns.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/create_individual_file.R +\name{add_mh_columns} +\alias{add_mh_columns} +\title{Add MH columns} +\usage{ +add_mh_columns(episode_file, prefix, condition) +} +\arguments{ +\item{episode_file}{Tibble containing episodic data} + +\item{prefix}{Prefix to add to related columns, e.g. 
"Acute"} + +\item{condition}{Condition to create new columns based on} +} +\description{ +Add MH columns +} diff --git a/man/add_nrs_columns.Rd b/man/add_nrs_columns.Rd new file mode 100644 index 000000000..9d7b3f8bf --- /dev/null +++ b/man/add_nrs_columns.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/create_individual_file.R +\name{add_nrs_columns} +\alias{add_nrs_columns} +\title{Add NRS columns} +\usage{ +add_nrs_columns(episode_file, prefix, condition) +} +\arguments{ +\item{episode_file}{Tibble containing episodic data} + +\item{prefix}{Prefix to add to related columns, e.g. "Acute"} + +\item{condition}{Condition to create new columns based on} +} +\description{ +Add NRS columns +} diff --git a/man/add_nsu_columns.Rd b/man/add_nsu_columns.Rd new file mode 100644 index 000000000..6a54bbcbf --- /dev/null +++ b/man/add_nsu_columns.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/create_individual_file.R +\name{add_nsu_columns} +\alias{add_nsu_columns} +\title{Add NSU columns} +\usage{ +add_nsu_columns(episode_file, prefix, condition) +} +\arguments{ +\item{episode_file}{Tibble containing episodic data} + +\item{prefix}{Prefix to add to related columns, e.g. "Acute"} + +\item{condition}{Condition to create new columns based on} +} +\description{ +Add NSU columns +} diff --git a/man/add_ooh_columns.Rd b/man/add_ooh_columns.Rd new file mode 100644 index 000000000..01814ab6d --- /dev/null +++ b/man/add_ooh_columns.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/create_individual_file.R +\name{add_ooh_columns} +\alias{add_ooh_columns} +\title{Add OoH columns} +\usage{ +add_ooh_columns(episode_file, prefix, condition) +} +\arguments{ +\item{episode_file}{Tibble containing episodic data} + +\item{prefix}{Prefix to add to related columns, e.g. 
"Acute"} + +\item{condition}{Condition to create new columns based on} +} +\description{ +Add OoH columns +} diff --git a/man/add_op_columns.Rd b/man/add_op_columns.Rd new file mode 100644 index 000000000..08c4419e2 --- /dev/null +++ b/man/add_op_columns.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/create_individual_file.R +\name{add_op_columns} +\alias{add_op_columns} +\title{Add OP columns} +\usage{ +add_op_columns(episode_file, prefix, condition) +} +\arguments{ +\item{episode_file}{Tibble containing episodic data} + +\item{prefix}{Prefix to add to related columns, e.g. "Acute"} + +\item{condition}{Condition to create new columns based on} +} +\description{ +Add OP columns +} diff --git a/man/add_pis_columns.Rd b/man/add_pis_columns.Rd new file mode 100644 index 000000000..b582acf2e --- /dev/null +++ b/man/add_pis_columns.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/create_individual_file.R +\name{add_pis_columns} +\alias{add_pis_columns} +\title{Add PIS columns} +\usage{ +add_pis_columns(episode_file, prefix, condition) +} +\arguments{ +\item{episode_file}{Tibble containing episodic data} + +\item{prefix}{Prefix to add to related columns, e.g. "Acute"} + +\item{condition}{Condition to create new columns based on} +} +\description{ +Add PIS columns +} diff --git a/man/add_sds_columns.Rd b/man/add_sds_columns.Rd new file mode 100644 index 000000000..d5a5fb2cf --- /dev/null +++ b/man/add_sds_columns.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/create_individual_file.R +\name{add_sds_columns} +\alias{add_sds_columns} +\title{Add SDS columns} +\usage{ +add_sds_columns(episode_file, prefix, condition) +} +\arguments{ +\item{episode_file}{Tibble containing episodic data} + +\item{prefix}{Prefix to add to related columns, e.g. 
"Acute"} + +\item{condition}{Condition to create new columns based on} +} +\description{ +Add SDS columns +} diff --git a/man/add_standard_cols.Rd b/man/add_standard_cols.Rd new file mode 100644 index 000000000..744aa49de --- /dev/null +++ b/man/add_standard_cols.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/create_individual_file.R +\name{add_standard_cols} +\alias{add_standard_cols} +\title{Add standard columns} +\usage{ +add_standard_cols( + episode_file, + prefix, + condition, + episode = FALSE, + cost = FALSE +) +} +\arguments{ +\item{episode_file}{Tibble containing episodic data} + +\item{prefix}{Prefix to add to related columns, e.g. "Acute"} + +\item{condition}{Condition to create new columns based on} + +\item{episode}{Whether to create prefix_episodes col, e.g. "Acute_episodes"} + +\item{cost}{Whether to create prefix_cost col, e.g. "Acute_cost"} +} +\description{ +Add standard columns (DoB, postcode, gpprac, episodes, cost) to episode file. +} diff --git a/man/aggregate_by_chi.Rd b/man/aggregate_by_chi.Rd new file mode 100644 index 000000000..73804ad9b --- /dev/null +++ b/man/aggregate_by_chi.Rd @@ -0,0 +1,15 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/create_individual_file.R +\name{aggregate_by_chi} +\alias{aggregate_by_chi} +\title{Aggregate by CHI} +\usage{ +aggregate_by_chi(episode_file) +} +\arguments{ +\item{episode_file}{Tibble containing episodic data} +} +\description{ +Aggregate episode file by CHI to convert into +individual file. 
+} diff --git a/man/aggregate_by_chi_zihao.Rd b/man/aggregate_by_chi_zihao.Rd new file mode 100644 index 000000000..3d4961e19 --- /dev/null +++ b/man/aggregate_by_chi_zihao.Rd @@ -0,0 +1,15 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/aggregate_by_chi_zihao.R +\name{aggregate_by_chi_zihao} +\alias{aggregate_by_chi_zihao} +\title{Aggregate by CHI} +\usage{ +aggregate_by_chi_zihao(episode_file) +} +\arguments{ +\item{episode_file}{Tibble containing episodic data} +} +\description{ +Aggregate episode file by CHI to convert into +individual file. +} diff --git a/man/aggregate_ch_episodes.Rd b/man/aggregate_ch_episodes.Rd new file mode 100644 index 000000000..2753da14f --- /dev/null +++ b/man/aggregate_ch_episodes.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/create_individual_file.R +\name{aggregate_ch_episodes} +\alias{aggregate_ch_episodes} +\title{Aggregate CIS episodes} +\usage{ +aggregate_ch_episodes(episode_file) +} +\arguments{ +\item{episode_file}{Tibble containing episodic data} +} +\description{ +Aggregate CH variables by CHI and CIS. +} diff --git a/man/aggregate_ch_episodes_zihao.Rd b/man/aggregate_ch_episodes_zihao.Rd new file mode 100644 index 000000000..808262654 --- /dev/null +++ b/man/aggregate_ch_episodes_zihao.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/aggregate_by_chi_zihao.R +\name{aggregate_ch_episodes_zihao} +\alias{aggregate_ch_episodes_zihao} +\title{Aggregate CIS episodes} +\usage{ +aggregate_ch_episodes_zihao(episode_file) +} +\arguments{ +\item{episode_file}{Tibble containing episodic data} +} +\description{ +Aggregate CH variables by CHI and CIS. 
+} diff --git a/man/clean_individual_file.Rd b/man/clean_individual_file.Rd new file mode 100644 index 000000000..fb2d3ae13 --- /dev/null +++ b/man/clean_individual_file.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/create_individual_file.R +\name{clean_individual_file} +\alias{clean_individual_file} +\title{Clean individual file} +\usage{ +clean_individual_file(individual_file, year) +} +\arguments{ +\item{individual_file}{Individual file where each row represents a unique CHI} + +\item{year}{Financial year e.g 1718} +} +\description{ +Clean up columns in individual file +} diff --git a/man/clean_up_ch.Rd b/man/clean_up_ch.Rd new file mode 100644 index 000000000..0182c84e8 --- /dev/null +++ b/man/clean_up_ch.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/create_individual_file.R +\name{clean_up_ch} +\alias{clean_up_ch} +\title{Clean up CH} +\usage{ +clean_up_ch(episode_file, year) +} +\arguments{ +\item{episode_file}{Tibble containing episodic data} + +\item{year}{The year to process, in FY format.} +} +\description{ +Clean up CH-related columns. +} diff --git a/man/clean_up_gender.Rd b/man/clean_up_gender.Rd new file mode 100644 index 000000000..edf05bfc8 --- /dev/null +++ b/man/clean_up_gender.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/create_individual_file.R +\name{clean_up_gender} +\alias{clean_up_gender} +\title{Clean up gender column} +\usage{ +clean_up_gender(individual_file) +} +\arguments{ +\item{individual_file}{Individual file where each row represents a unique CHI} +} +\description{ +Clean up column containing gender. 
+} diff --git a/man/condition_cols.Rd b/man/condition_cols.Rd new file mode 100644 index 000000000..ba037a609 --- /dev/null +++ b/man/condition_cols.Rd @@ -0,0 +1,13 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/create_individual_file.R +\name{condition_cols} +\alias{condition_cols} +\title{Condition columns} +\usage{ +condition_cols() +} +\description{ +Returns chr vector of column names +which follow format "condition" and "condition_date" e.g. +"dementia" and "dementia_date" +} diff --git a/man/create_individual_file.Rd b/man/create_individual_file.Rd new file mode 100644 index 000000000..fa759e7b1 --- /dev/null +++ b/man/create_individual_file.Rd @@ -0,0 +1,34 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/create_individual_file.R +\name{create_individual_file} +\alias{create_individual_file} +\title{Create individual file} +\usage{ +create_individual_file( + episode_file, + year, + write_to_disk = TRUE, + anon_chi_in = TRUE, + anon_chi_out = TRUE +) +} +\arguments{ +\item{episode_file}{Tibble containing episodic data} + +\item{year}{The year to process, in FY format.} + +\item{write_to_disk}{(optional) Should the data be written to disk default is +\code{TRUE} i.e. 
write the data to disk.} + +\item{anon_chi_in}{(Default:TRUE) Is \code{anon_chi} used in the input +(instead of chi)} + +\item{anon_chi_out}{(Default:TRUE) Should \code{anon_chi} be used in the output +(instead of chi)} +} +\value{ +The processed individual file +} +\description{ +Creates individual file from episode file +} diff --git a/man/join_sc_client.Rd b/man/join_sc_client.Rd new file mode 100644 index 000000000..a30719698 --- /dev/null +++ b/man/join_sc_client.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/create_individual_file.R +\name{join_sc_client} +\alias{join_sc_client} +\title{Join sc client variables onto individual file} +\usage{ +join_sc_client( + individual_file, + year, + sc_client = read_file(get_source_extract_path(year, "Client")), + sc_demographics = read_file(get_sc_demog_lookup_path(), col_select = + c("sending_location", "social_care_id", "chi")) +) +} +\arguments{ +\item{individual_file}{the processed individual file} + +\item{year}{financial year.} + +\item{sc_client}{SC client lookup} + +\item{sc_demographics}{SC Demographic lookup} +} +\description{ +Match on sc client variables. 
+} diff --git a/man/join_slf_lookup_vars.Rd b/man/join_slf_lookup_vars.Rd new file mode 100644 index 000000000..980c66f31 --- /dev/null +++ b/man/join_slf_lookup_vars.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/create_individual_file.R +\name{join_slf_lookup_vars} +\alias{join_slf_lookup_vars} +\title{Join slf lookup variables} +\usage{ +join_slf_lookup_vars( + individual_file, + slf_postcode_lookup = read_file(get_slf_postcode_path()), + slf_gpprac_lookup = read_file(get_slf_gpprac_path(), col_select = c("gpprac", + "cluster", "hbpraccode")), + hbrescode_var = "hb2018" +) +} +\arguments{ +\item{individual_file}{the processed individual file.} + +\item{slf_postcode_lookup}{SLF processed postcode lookup} + +\item{slf_gpprac_lookup}{SLF processed gpprac lookup} + +\item{hbrescode_var}{hbrescode variable} +} +\description{ +Join lookup variables from slf postcode lookup and slf gpprac +lookup. +} diff --git a/man/max_no_inf.Rd b/man/max_no_inf.Rd new file mode 100644 index 000000000..79b9a1057 --- /dev/null +++ b/man/max_no_inf.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/create_individual_file.R +\name{max_no_inf} +\alias{max_no_inf} +\title{Custom maximum} +\usage{ +max_no_inf(x) +} +\arguments{ +\item{x}{Vector to return max of} +} +\description{ +Custom maximum function which removes +missing values but doesn't return Inf if all values +are missing (instead returns NA) +} diff --git a/man/min_no_inf.Rd b/man/min_no_inf.Rd new file mode 100644 index 000000000..38029214f --- /dev/null +++ b/man/min_no_inf.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/create_individual_file.R +\name{min_no_inf} +\alias{min_no_inf} +\title{Custom minimum} +\usage{ +min_no_inf(x) +} +\arguments{ +\item{x}{Vector to return min of} +} +\description{ +Custom minimum function which removes +missing values but doesn't return Inf 
if all values +are missing (instead returns NA) +} diff --git a/man/recode_gender.Rd b/man/recode_gender.Rd new file mode 100644 index 000000000..526d2829d --- /dev/null +++ b/man/recode_gender.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/create_individual_file.R +\name{recode_gender} +\alias{recode_gender} +\title{Recode gender} +\usage{ +recode_gender(episode_file) +} +\arguments{ +\item{episode_file}{Tibble containing episodic data} +} +\description{ +Recode gender to 1.5 if 0 or 9. +} diff --git a/man/remove_blank_chi.Rd b/man/remove_blank_chi.Rd new file mode 100644 index 000000000..9cba40a8f --- /dev/null +++ b/man/remove_blank_chi.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/create_individual_file.R +\name{remove_blank_chi} +\alias{remove_blank_chi} +\title{Remove blank CHI} +\usage{ +remove_blank_chi(episode_file) +} +\arguments{ +\item{episode_file}{Tibble containing episodic data} +} +\description{ +Convert blank strings to NA and remove NAs from CHI column +} diff --git a/man/select.Rd b/man/select.Rd new file mode 100644 index 000000000..435096d9a --- /dev/null +++ b/man/select.Rd @@ -0,0 +1,30 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/aggregate_by_chi_zihao.R +\name{vars_end_with} +\alias{vars_end_with} +\alias{vars_start_with} +\alias{vars_contain} +\title{select columns ending with some patterns} +\usage{ +vars_end_with(data, vars, ignore_case = FALSE) + +vars_start_with(data, vars, ignore_case = FALSE) + +vars_contain(data, vars, ignore_case = FALSE) +} +\description{ +select columns ending with some patterns + +select columns starting with some patterns + +select columns contains some characters +} +\section{Functions}{ +\itemize{ +\item \code{vars_end_with()}: columns based on patterns + +\item \code{vars_start_with()}: columns based on patterns + +\item \code{vars_contain()}: columns based on 
patterns + +}} From 55d5948a50a7714a4110d0c940ad28fd2f14663b Mon Sep 17 00:00:00 2001 From: James McMahon Date: Wed, 19 Jul 2023 12:55:42 +0100 Subject: [PATCH 11/16] Fix missed bracket in _targets.R --- _targets.R | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/_targets.R b/_targets.R index ef2fbbe74..af0fd257c 100644 --- a/_targets.R +++ b/_targets.R @@ -561,8 +561,9 @@ list( process_tests_individual_file( data = individual_file, year = year -), - tar_target( + ) + ), + tar_target( episode_file_dataset, arrow::write_dataset( dataset = episode_file, From e4c14652ec8d919fdaa077f193dfab31ca3f2782 Mon Sep 17 00:00:00 2001 From: James McMahon Date: Wed, 19 Jul 2023 15:14:25 +0100 Subject: [PATCH 12/16] Update arrow dataset targets --- _targets.R | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/_targets.R b/_targets.R index af0fd257c..0886466a8 100644 --- a/_targets.R +++ b/_targets.R @@ -567,13 +567,31 @@ list( episode_file_dataset, arrow::write_dataset( dataset = episode_file, - path = fs::path_ext_remove(slf_episode_path), + path = fs::path( + get_year_dir(year), + stringr::str_glue("source-episode-file-{year}") + ), format = "parquet", # Should correspond to the available slfhelper filters partitioning = c("recid", "hscp2018"), compression = "zstd", version = "latest" ) + ), + tar_target( + individual_file_dataset, + arrow::write_dataset( + dataset = individual_file, + path = fs::path( + get_year_dir(year), + stringr::str_glue("source-individual-file-{year}") + ), + format = "parquet", + # Should correspond to the available slfhelper filters + partitioning = c("hscp2018"), + compression = "zstd", + version = "latest" + ) ) ) ) From 24012df030e61df8f7e0126507fd63c08747f38c Mon Sep 17 00:00:00 2001 From: James McMahon Date: Thu, 20 Jul 2023 19:27:36 +0100 Subject: [PATCH 13/16] Fix for years with no DN data (#755) * Fix for years with no DN data 21/22 and 22/23 were failing because they didn't have
`total_no_dn_contacts`. This is a bit of a crude fix but should work for any year, and if the variable doesn't exist in the episode file it will be created as `NA` in the individual file. * Selectively clean up variables after --- R/create_individual_file.R | 36 +++++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/R/create_individual_file.R b/R/create_individual_file.R index 675e2066a..b7812c806 100644 --- a/R/create_individual_file.R +++ b/R/create_individual_file.R @@ -25,7 +25,7 @@ create_individual_file <- function( } individual_file <- episode_file %>% - dplyr::select( + dplyr::select(dplyr::any_of(c( "year", "chi", "dob", @@ -57,7 +57,7 @@ create_individual_file <- function( "hc_hours_annual", "hc_reablement", "ooh_case_id" - ) %>% + ))) %>% remove_blank_chi() %>% add_cij_columns() %>% add_all_columns() %>% @@ -321,9 +321,21 @@ add_ooh_columns <- function(episode_file, prefix, condition) { #' @inheritParams add_acute_columns add_dn_columns <- function(episode_file, prefix, condition) { condition <- substitute(condition) - episode_file %>% - add_standard_cols(prefix, condition, episode = TRUE, cost = TRUE) %>% - dplyr::mutate("{prefix}_contacts" := dplyr::if_else(eval(condition), .data$total_no_dn_contacts, NA_integer_)) + if ("total_no_dn_contacts" %in% names(episode_file)) { + episode_file %>% + add_standard_cols(prefix, condition, episode = TRUE, cost = TRUE) %>% + dplyr::mutate( + "{prefix}_contacts" := dplyr::if_else( + eval(condition), + .data$total_no_dn_contacts, + NA_integer_ + ) + ) + } else { + episode_file %>% + add_standard_cols(prefix, condition, episode = TRUE, cost = TRUE) %>% + dplyr::mutate("{prefix}_contacts" := NA_integer_) + } } #' Add CMH columns @@ -760,14 +772,12 @@ clean_individual_file <- function(individual_file, year) { cli::cli_alert_info("Clean individual file function started at {Sys.time()}") individual_file %>% - dplyr::select( - !c( - "ch_no_cost", - "no_paid_items", -
"total_no_dn_contacts", - "cost_total_net_inc_dnas" - ) - ) %>% + dplyr::select(dplyr::any_of(!c( + "ch_no_cost", + "no_paid_items", + "total_no_dn_contacts", + "cost_total_net_inc_dnas" + ))) %>% clean_up_gender() %>% dplyr::mutate(age = compute_mid_year_age(year, .data$dob)) } From 9b676fc2bcc0f656ed43d96a218258f2526a30fc Mon Sep 17 00:00:00 2001 From: James McMahon Date: Thu, 20 Jul 2023 19:28:00 +0100 Subject: [PATCH 14/16] Avoid duplicate `health_net_cost_inc_dnas` (#756) * Selectively clean up variables after * Avoid selecting variables we don't want `cost_total_net_inc_dnas` was being picked up here which we didn't want. --- R/aggregate_by_chi_zihao.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/R/aggregate_by_chi_zihao.R b/R/aggregate_by_chi_zihao.R index 0eee203e8..7d9ce5ed3 100644 --- a/R/aggregate_by_chi_zihao.R +++ b/R/aggregate_by_chi_zihao.R @@ -81,8 +81,7 @@ aggregate_by_chi_zihao <- function(episode_file) { "other", "dn", "nhs24", - "pcc", - "_dnas" + "pcc" ) ), vars_start_with( From 4eb6b9395b2b27fd6366b7a1874437005bc33ea4 Mon Sep 17 00:00:00 2001 From: James McMahon Date: Fri, 21 Jul 2023 12:25:41 +0100 Subject: [PATCH 15/16] Fix a typo --- R/create_individual_file.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/create_individual_file.R b/R/create_individual_file.R index b7812c806..e2cf996a1 100644 --- a/R/create_individual_file.R +++ b/R/create_individual_file.R @@ -772,7 +772,7 @@ clean_individual_file <- function(individual_file, year) { cli::cli_alert_info("Clean individual file function started at {Sys.time()}") individual_file %>% - dplyr::select(dplyr::any_of(!c( + dplyr::select(!dplyr::any_of(c( "ch_no_cost", "no_paid_items", "total_no_dn_contacts", From 1efe25e7a092310fafc931f2557d49e065a11a1c Mon Sep 17 00:00:00 2001 From: Jennit07 <67372904+Jennit07@users.noreply.github.com> Date: Mon, 31 Jul 2023 12:54:46 +0100 Subject: [PATCH 16/16] Add tests for delayed discharges extract (#760) * Add tests
for delayed discharges extract * Style code * Change calculation to TRUE/FALSE Co-authored-by: James McMahon * Remove TODO and add DD tests to targets pipeline --------- Co-authored-by: Jennit07 Co-authored-by: James McMahon --- NAMESPACE | 1 + R/process_tests_delayed_discharges.R | 50 +++++++++++++++++++++++++ _targets.R | 8 +++- man/process_tests_delayed_discharges.Rd | 20 ++++++++++ man/produce_source_dd_tests.Rd | 28 ++++++++++++++ man/produce_source_pis_tests.Rd | 4 ++ 6 files changed, 110 insertions(+), 1 deletion(-) create mode 100644 R/process_tests_delayed_discharges.R create mode 100644 man/process_tests_delayed_discharges.Rd create mode 100644 man/produce_source_dd_tests.Rd diff --git a/NAMESPACE b/NAMESPACE index d87bf9397..464cced34 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -115,6 +115,7 @@ export(process_tests_ae) export(process_tests_alarms_telecare) export(process_tests_care_home) export(process_tests_cmh) +export(process_tests_delayed_discharges) export(process_tests_district_nursing) export(process_tests_episode_file) export(process_tests_gp_ooh) diff --git a/R/process_tests_delayed_discharges.R b/R/process_tests_delayed_discharges.R new file mode 100644 index 000000000..b540d1f74 --- /dev/null +++ b/R/process_tests_delayed_discharges.R @@ -0,0 +1,50 @@ +#' Process Delayed Discharges tests +#' +#' @description Takes the processed Delayed Discharges extract and produces +#' a test comparison with the previous data. This is written to disk as a CSV. +#' +#' @param data a [tibble][tibble::tibble-package] of the processed data extract. +#' @param year the financial year of the extract in the format '1718'. +#' +#' @return a [tibble][tibble::tibble-package] containing a test comparison. 
+#' +#' @export +process_tests_delayed_discharges <- function(data, year) { + old_data <- get_existing_data_for_tests(data) + + comparison <- produce_test_comparison( + old_data = produce_source_dd_tests(old_data), + new_data = produce_source_dd_tests(data) + ) %>% + write_tests_xlsx(sheet_name = "DD", year) + + return(comparison) +} + +#' Delayed Discharges extract tests +#' +#' @description Produce tests for the delayed discharges extract. +#' +#' @param data new or old data for testing summary flags +#' (data is from [get_source_extract_path()]) +#' +#' @return a dataframe with a count of each flag +#' from [calculate_measures()] +#' +#' @family extract test functions +#' for creating test flags +#' @seealso calculate_measures +produce_source_dd_tests <- function(data) { + test_flags <- data %>% + dplyr::mutate( + n_delay_episodes = 1L, + code9_episodes = .data$primary_delay_reason == "9" + ) %>% + create_hb_test_flags(.data$hbtreatcode) %>% + # keep variables for comparison + dplyr::select(c("n_delay_episodes":dplyr::last_col())) %>% + # use function to sum new test flags + calculate_measures(measure = "sum") + + return(test_flags) +} diff --git a/_targets.R b/_targets.R index 0886466a8..f50045aed 100644 --- a/_targets.R +++ b/_targets.R @@ -310,12 +310,18 @@ list( year ) ), - # TODO add tests for the Delayed Discharges extract tar_target(source_dd_extract, process_extract_delayed_discharges( dd_data, year, write_to_disk = write_to_disk )), + tar_target( + tests_source_dd_extract, + process_tests_delayed_discharges( + source_dd_extract, + year + ) + ), tar_target(source_dn_extract, process_extract_district_nursing( dn_data, year, diff --git a/man/process_tests_delayed_discharges.Rd b/man/process_tests_delayed_discharges.Rd new file mode 100644 index 000000000..68e1b8f17 --- /dev/null +++ b/man/process_tests_delayed_discharges.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/process_tests_delayed_discharges.R 
+\name{process_tests_delayed_discharges} +\alias{process_tests_delayed_discharges} +\title{Process Delayed Discharges tests} +\usage{ +process_tests_delayed_discharges(data, year) +} +\arguments{ +\item{data}{a \link[tibble:tibble-package]{tibble} of the processed data extract.} + +\item{year}{the financial year of the extract in the format '1718'.} +} +\value{ +a \link[tibble:tibble-package]{tibble} containing a test comparison. +} +\description{ +Takes the processed Delayed Discharges extract and produces +a test comparison with the previous data. This is written to disk as a CSV. +} diff --git a/man/produce_source_dd_tests.Rd b/man/produce_source_dd_tests.Rd new file mode 100644 index 000000000..2eb9f6455 --- /dev/null +++ b/man/produce_source_dd_tests.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/process_tests_delayed_discharges.R +\name{produce_source_dd_tests} +\alias{produce_source_dd_tests} +\title{Delayed Discharges extract tests} +\usage{ +produce_source_dd_tests(data) +} +\arguments{ +\item{data}{new or old data for testing summary flags +(data is from \code{\link[=get_source_extract_path]{get_source_extract_path()}})} +} +\value{ +a dataframe with a count of each flag +from \code{\link[=calculate_measures]{calculate_measures()}} +} +\description{ +Produce tests for the delayed discharges extract. +} +\seealso{ +calculate_measures + +Other extract test functions +for creating test flags: +\code{\link{produce_source_pis_tests}()} +} +\concept{extract test functions +for creating test flags} diff --git a/man/produce_source_pis_tests.Rd b/man/produce_source_pis_tests.Rd index 070cc789d..487ad2fd7 100644 --- a/man/produce_source_pis_tests.Rd +++ b/man/produce_source_pis_tests.Rd @@ -24,6 +24,10 @@ episode date variables. } \seealso{ calculate_measures + +Other extract test functions +for creating test flags: +\code{\link{produce_source_dd_tests}()} } \concept{extract test functions for creating test flags}