add duplicated chi number to the tests in process_tests_individual_file

Public-Health-Scotland · Aug 9, 2023 · 61451d0 · 61451d0
1 parent b36386e
commit 61451d0
Showing 1 changed file with 114 additions and 15 deletions.
diff --git a/R/process_tests_individual_file.R b/R/process_tests_individual_file.R
@@ -1,18 +1,117 @@
-#' process_tests_individual_file
+#' Process Individual file tests
 #'
-#' @description check whether individual files have duplicated rows for the same chi
+#' @description Takes the processed individual file and produces
+#' a test comparison with the previous data. This is written to disk as xlsx.
 #'
-#' @return NULL if no duplicated chi, OR rows with duplicated chi.
-process_tests_individual_file <- function(individual_file, anon_chi_in = FALSE) {
-  chi_col <- dplyr::if_else(anon_chi_in, "anon_chi", "chi")
-  duplicated_chi <- duplicated(individual_file[[chi_col]])
-  dup_num <- sum(duplicated_chi)
-  if (dup_num < 1L) {
-    print("There is no duplicated CHI")
-    return(NULL)
-  } else {
-    print("There are duplicated CHIs")
-    return(individual_file %>%
-      dplyr::filter(!!sym(chi_col)) %in% duplicated_chi)
-  }
+#' @inherit process_tests_acute
+#'
+#' @export
+process_tests_individual_file <- function(data, year) {
+  data <- data %>%  # keep only the columns the tests below rely on
+    dplyr::select(
+      "year",
+      "anon_chi",
+      "gender",
+      "postcode",
+      "dob",
+      "hbrescode",
+      "health_net_cost",
+      slfhelper::ltc_vars,
+      dplyr::contains(c(  # every column whose name contains one of these
+        "beddays",
+        "cost",
+        "episodes",
+        "attendances",
+        "admissions",
+        "cases",
+        "consultations"
+      ))
+    ) %>%
+    slfhelper::get_chi()  # presumably turns anon_chi into chi - TODO confirm
+  # NOTE(review): old_data is presumably the previous individual file - confirm
+  old_data <- get_existing_data_for_tests(data, file_version = "individual")
+  # run the same set of tests on old and new data, then compare and write out
+  comparison <- produce_test_comparison(
+    old_data = produce_individual_file_tests(old_data),
+    new_data = produce_individual_file_tests(data)
+  ) %>%
+    write_tests_xlsx(sheet_name = "indiv_file", year)  # result is returned
+
+  return(comparison)
+}
+
+#' Individual File Tests
+#'
+#' @description Produce a set of tests for the individual file.
+#' This will produce sums of the demographic flags created by
+#' [create_demog_test_flags()], of the `hbrescode` flags created by
+#' [create_hb_test_flags()] and of the `hbrescode` cost flags created by
+#' [create_hb_cost_test_flags()] from `health_net_cost`.
+#' It will also produce measures for the activity and cost variables,
+#' min-max measures for `health_net_cost`, sums of the LTC variables
+#' and a count of duplicated CHI numbers.
+#'
+#' @param data new or old data for testing summary flags
+#' (column names are converted to lower case before testing)
+#'
+#' @return a dataframe with a `measure` and `value` column holding
+#' the result of each test
+produce_individual_file_tests <- function(data) {
+  names(data) <- tolower(names(data))
+
+  test_flags <- data %>%
+    # use functions to create demographic, HB and HB cost flags
+    create_demog_test_flags() %>%
+    create_hb_test_flags(.data$hbrescode) %>%
+    create_hb_cost_test_flags(.data$hbrescode, .data$health_net_cost) %>%
+    # keep variables for comparison
+    dplyr::select(c("valid_chi":dplyr::last_col())) %>%
+    # use function to sum new test flags
+    calculate_measures(measure = "sum")
+
+  all_measures <- data %>%
+    calculate_measures(
+      vars = c(
+        "beddays",
+        "cost",
+        "episodes",
+        "attendances",
+        "admissions",
+        "cases",
+        "consultations"
+      ),
+      measure = "all"
+    )
+
+  min_max_measures <- data %>%
+    calculate_measures(
+      vars = c(
+        "health_net_cost"
+      ),
+      measure = "min-max"
+    )
+
+  sum_measures <- data %>%
+    dplyr::select(slfhelper::ltc_vars) %>%
+    calculate_measures(
+      vars = c(
+        slfhelper::ltc_vars
+      ),
+      measure = "sum"
+    )
+  # number of rows whose chi has already appeared in an earlier row
+  dup_chi <- data.frame(measure = "duplicated chi number",
+                        value = duplicated(data$chi) %>%
+                          sum() %>% as.integer())
+  # combine every set of results into a single measure/value table
+  join_output <- list(
+    test_flags,
+    all_measures,
+    min_max_measures,
+    sum_measures,
+    dup_chi
+  ) %>%
+    purrr::reduce(dplyr::full_join, by = c("measure", "value"))
+
+  return(join_output)
 }