Merge branch 'mar-23-update' into bug-service_use_cohort

Public-Health-Scotland · Feb 14, 2024 · 760e2e5 · 760e2e5
2 parents b1a9523 + 842e616
commit 760e2e5
Show file tree

Hide file tree

Showing 9 changed files with 232 additions and 153 deletions.
diff --git a/R/fix_sc_dates.R b/R/fix_sc_dates.R
@@ -9,7 +9,7 @@
 #' @return A date vector with replaced end dates
 fix_sc_start_dates <- function(start_date, period_start) {
   # Fix sds_start_date is missing by setting start_date to be the start of
-  # financial year
+  # financial period
   start_date <- dplyr::if_else(
     is.na(start_date),
     period_start,
@@ -30,12 +30,12 @@ fix_sc_start_dates <- function(start_date, period_start) {
 #' @param period Social care latest submission period.
 #'
 #' @return A date vector with replaced end dates
-fix_sc_end_dates <- function(start_date, end_date, period) {
+fix_sc_end_dates <- function(start_date, end_date, period_end_date) {
   # Fix sds_end_date is earlier than sds_start_date by setting end_date to be
   # the end of financial year
   end_date <- dplyr::if_else(
     start_date > end_date,
-    end_fy(year = stringr::str_sub(period, 1L, 4L), "alternate"),
+    period_end_date,
     end_date
   )
 
@@ -57,7 +57,7 @@ fix_sc_end_dates <- function(start_date, end_date, period) {
 #' @return A date vector with replaced end dates
 fix_sc_missing_end_dates <- function(end_date, period_end) {
   # Fix sds_end_date is earlier than sds_start_date by setting end_date to be
-  # the end of financial year
+  # the end of financial period
   end_date <- dplyr::if_else(
     is.na(end_date),
     period_end,

diff --git a/R/process_extract_homelessness.R b/R/process_extract_homelessness.R
@@ -100,6 +100,36 @@ process_extract_homelessness <- function(
         )
       )
     ) %>%
+    dplyr::mutate( # recode property type codes to "<code> - <description>" labels
+      property_type_code = as.character(property_type_code), # character type so codes and labels can share the column
+      property_type_code = dplyr::case_when(
+        property_type_code == "1" ~ "1 - Own Property - LA Tenancy",
+        property_type_code == "2" ~ "2 - Own Property - RSL Tenancy",
+        property_type_code == "3" ~ "3 - Own Property - private rented tenancy",
+        property_type_code == "4" ~ "4 - Own Property - tenancy secured through employment/tied house",
+        property_type_code == "5" ~ "5 - Own Property - owning/buying",
+        property_type_code == "6" ~ "6 - Parental / family home / relatives",
+        property_type_code == "7" ~ "7 - Friends / partners", # no leading space, consistent with the other labels
+        property_type_code == "8" ~ "8 - Armed Services Accommodation",
+        property_type_code == "9" ~ "9 - Prison",
+        property_type_code == "10" ~ "10 - Hospital",
+        property_type_code == "11" ~ "11 - Children's residential accommodation (looked after by the local authority)",
+        property_type_code == "12" ~ "12 - Supported accommodation",
+        property_type_code == "13" ~ "13 - Hostel (unsupported)",
+        property_type_code == "14" ~ "14 - Bed & Breakfast",
+        property_type_code == "15" ~ "15 - Caravan / mobile home",
+        property_type_code == "16" ~ "16 - Long-term roofless",
+        property_type_code == "17" ~ "17 - Long-term sofa surfing",
+        property_type_code == "18" ~ "18 - Other",
+        property_type_code == "19" ~ "19 - Not known / refused",
+        property_type_code == "20" ~ "20 - Own property - Shared ownership/Shared equity/ LCHO",
+        property_type_code == "21" ~ "21 - Lodger",
+        property_type_code == "22" ~ "22 - Shared Property - Private Rented Sector",
+        property_type_code == "23" ~ "23 - Shared Property - Local Authority",
+        property_type_code == "24" ~ "24 - Shared Property - RSL",
+        TRUE ~ property_type_code # codes outside 1-24 pass through unchanged
+      )
+    ) %>%
     dplyr::left_join(
       la_code_lookup,
       by = dplyr::join_by("sending_local_authority_code_9" == "CA")

diff --git a/R/process_lookup_sc_demographics.R b/R/process_lookup_sc_demographics.R
@@ -28,30 +28,46 @@ process_lookup_sc_demographics <- function(
     dplyr::pull(.data$pc7)
 
 
-  # Data Cleaning ---------------------------------------
-
+  #  Fill in missing data and flag latest cases to keep ---------------------------------------
   sc_demog <- data %>%
-    dplyr::mutate(
-      # use chi if upi is NA
-      upi = dplyr::coalesce(.data$upi, .data$chi_upi),
-      # check gender code - replace code 99 with 9
-      submitted_gender = replace(.data$submitted_gender, .data$submitted_gender == 99L, 9L)
+    dplyr::rename(
+      chi = chi_upi,
+      gender = chi_gender_code,
+      dob = chi_date_of_birth
     ) %>%
+    # fill in missing demographic details
+    dplyr::arrange(period, social_care_id) %>%
+    dplyr::group_by(social_care_id, sending_location) %>%
+    tidyr::fill(chi, .direction = ("updown")) %>%
+    tidyr::fill(dob, .direction = ("updown")) %>%
+    tidyr::fill(date_of_death, .direction = ("updown")) %>%
+    tidyr::fill(gender, .direction = ("updown")) %>%
+    tidyr::fill(chi_postcode, .direction = ("updown")) %>%
+    tidyr::fill(submitted_postcode, .direction = ("updown")) %>%
+    dplyr::ungroup() %>%
+    # format postcodes using `phsmethods`
+    dplyr::mutate(dplyr::across(tidyselect::contains("postcode"), ~ phsmethods::format_postcode(.x, format = "pc7"))) # are sc postcodes even used anywhere?
+
+
+  # Flag the latest record per CHI and per social care ID within each sending location (the supplied latest_record_flag is not accurate, so it is recomputed from period)
+  sc_demog <- sc_demog %>%
+    dplyr::group_by(chi, sending_location) %>%
+    dplyr::mutate(latest = dplyr::last(period)) %>% # flag latest period for chi
+    dplyr::group_by(chi, social_care_id, sending_location) %>%
+    dplyr::mutate(latest_sc_id = dplyr::last(period)) %>% # flag latest period for social care
+    dplyr::group_by(chi, sending_location) %>%
+    dplyr::mutate(last_sc_id = dplyr::last(social_care_id)) %>%
     dplyr::mutate(
-      # use CHI sex if available
-      gender = dplyr::if_else(
-        is.na(.data$chi_gender_code) | .data$chi_gender_code == 9L,
-        .data$submitted_gender,
-        .data$chi_gender_code
-      ),
-      # Use CHI DoB if available
-      dob = dplyr::coalesce(.data$chi_date_of_birth, .data$submitted_date_of_birth)
+      latest_flag = ifelse((latest == period & last_sc_id == social_care_id) | is.na(chi), 1, 0),
+      keep = ifelse(latest_sc_id == period, 1, 0)
     ) %>%
-    # format postcodes using `phsmethods`
-    dplyr::mutate(dplyr::across(
-      tidyselect::contains("postcode"),
-      ~ phsmethods::format_postcode(.x, format = "pc7")
-    ))
+    dplyr::ungroup()
+
+  sc_demog <- sc_demog %>%
+    dplyr::select(-period, -latest_record_flag, -latest, -last_sc_id, -latest_sc_id) %>%
+    dplyr::distinct()
+
+  # postcodes ---------------------------------------------------------------
 
   # count number of na postcodes
   na_postcodes <- sc_demog %>%
@@ -69,29 +85,32 @@ process_lookup_sc_demographics <- function(
       ~ dplyr::if_else(stringr::str_detect(.x, uk_pc_regexp), .x, NA)
     )) %>%
     dplyr::select(
-      "latest_record_flag",
-      "extract_date",
       "sending_location",
       "social_care_id",
-      "upi",
+      "chi",
       "gender",
       "dob",
+      "date_of_death",
       "submitted_postcode",
-      "chi_postcode"
+      "chi_postcode",
+      "keep",
+      "latest_flag"
     ) %>%
     # check if submitted_postcode matches with postcode lookup
     dplyr::mutate(
-      valid_pc = .data$submitted_postcode %in% valid_spd_postcodes
+      valid_pc_submitted = .data$submitted_postcode %in% valid_spd_postcodes,
+      valid_pc_chi = .data$chi_postcode %in% valid_spd_postcodes
     ) %>%
     # use submitted_postcode if valid, otherwise use chi_postcode
     dplyr::mutate(postcode = dplyr::case_when(
-      (!is.na(.data$submitted_postcode) & .data$valid_pc) ~ .data$submitted_postcode,
-      (is.na(.data$submitted_postcode) & !.data$valid_pc) ~ .data$chi_postcode
+      (!is.na(.data$chi_postcode) & .data$valid_pc_chi) ~ .data$chi_postcode,
+      ((is.na(.data$chi_postcode) | !(.data$valid_pc_chi)) & !(is.na(.data$submitted_postcode)) & .data$valid_pc_submitted) ~ .data$submitted_postcode,
+      (is.na(.data$submitted_postcode) & !.data$valid_pc_submitted) ~ .data$chi_postcode
     )) %>%
     dplyr::mutate(postcode_type = dplyr::case_when(
-      (!is.na(.data$submitted_postcode) & .data$valid_pc) ~ "submitted",
-      (is.na(.data$submitted_postcode) & !.data$valid_pc) ~ "chi",
-      (is.na(.data$submitted_postcode) & is.na(.data$chi_postcode)) ~ "missing"
+      (postcode == chi_postcode) ~ "chi",
+      (postcode == submitted_postcode) ~ "submitted",
+      (is.na(.data$submitted_postcode) & is.na(.data$chi_postcode) | is.na(.data$postcode)) ~ "missing"
     ))
 
   # Check where the postcodes are coming from
@@ -102,26 +121,32 @@ process_lookup_sc_demographics <- function(
   na_replaced_postcodes <- sc_demog %>%
     dplyr::count(dplyr::across(tidyselect::ends_with("_postcode"), ~ is.na(.x)))
 
-
   sc_demog_lookup <- sc_demog %>%
+    dplyr::filter(keep == 1) %>% # filter to only keep latest record for sc id and chi
+    dplyr::select(-postcode_type, -valid_pc_submitted, -valid_pc_chi, -submitted_postcode, -chi_postcode) %>%
+    dplyr::distinct() %>%
     # group by sending location and ID
-    dplyr::group_by(.data$sending_location, .data$social_care_id) %>%
+    dplyr::group_by(.data$sending_location, .data$chi, .data$social_care_id, .data$latest_flag) %>%
     # arrange so latest submissions are last
     dplyr::arrange(
       .data$sending_location,
       .data$social_care_id,
-      .data$latest_record_flag,
-      .data$extract_date
+      .data$latest_flag
     ) %>%
     # summarise to select the last (non NA) submission
     dplyr::summarise(
-      chi = dplyr::last(.data$upi),
       gender = dplyr::last(.data$gender),
       dob = dplyr::last(.data$dob),
-      postcode = dplyr::last(.data$postcode)
+      postcode = dplyr::last(.data$postcode),
+      date_of_death = dplyr::last(.data$date_of_death)
     ) %>%
     dplyr::ungroup()
 
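+  # Sanity check that all CHIs are still present: these counts are neither stored nor asserted (the trailing numbers are presumably values from a previous run)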
+  dplyr::n_distinct(sc_demog_lookup$chi) # 524810
+  dplyr::n_distinct(sc_demog_lookup$social_care_id) # 636404
+
+
   if (write_to_disk) {
     write_file(
       sc_demog_lookup,