From 5f565fc341ba2131221c95878a4a30ffe2aafa40 Mon Sep 17 00:00:00 2001
From: Jennit07 <67372904+Jennit07@users.noreply.github.com>
Date: Wed, 22 May 2024 09:54:11 +0100
Subject: [PATCH] Update `00_sort_bi_extracts` to write anon_chi (#952)

* Update `00_sort_BI_extracts`

Save a new file with `anon-` prefix and use slfhelper to get the anon_chi

* remove file copy

* Update `00_sort_bi_extracts` note

* Style code

* Update chi when this is different e.g UPI number or PAT_UPI

* remove storing as a dataframe

* Add condition if CHI exists in data file

* update 00_Sort_BI_Extracts

replace for loop by function to enable parallel computing with lapply

* Style code

* merge similar code

* simplify sort_bi_extracts

---------

Co-authored-by: Jennit07
Co-authored-by: Zihao Li
Co-authored-by: lizihao-anu
---
 00_Sort_BI_Extracts.R | 56 +++++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 43 insertions(+), 13 deletions(-)

diff --git a/00_Sort_BI_Extracts.R b/00_Sort_BI_Extracts.R
index 7cf7d0708..0638123b9 100644
--- a/00_Sort_BI_Extracts.R
+++ b/00_Sort_BI_Extracts.R
@@ -1,5 +1,5 @@
 # Define the source directory and financial year pattern
-compress_files <- FALSE
+compress_files <- TRUE
 source_dir <- "/conf/sourcedev/Source_Linkage_File_Updates/Extracts Temp"
 pattern <- "-20(\\d{4})\\.csv"
 
@@ -20,31 +20,61 @@ extract_financial_year <- function(filename) {
   }
 }
 
-# Create directories for each financial year and move files
-for (csv_file in csv_files) {
+# Check whether a CSV has a CHI-style column, i.e. a column name containing
+# "UPI". Only the header and first data row are read.
+is_chi_in_file <- function(filename) {
+  data <- read.csv(filename, nrows = 1)
+  any(grepl("UPI", names(data)))
+}
+
+# Copy one temp extract into its financial year "Extracts" folder as
+# "anon-<file name>", swapping chi for anon_chi when the file has a UPI
+# column, optionally gzip the new file, then delete the temp file.
+move_temps_to_year_extract <- function(csv_file, compress_files = TRUE) {
   financial_year <- extract_financial_year(csv_file)
   # check if year directory exists
   if (!is.null(financial_year)) {
     financial_year_dir <- file.path("/conf/sourcedev/Source_Linkage_File_Updates", financial_year, "Extracts")
-    # if not, create the year directory
+    # if financial_year_dir does not exist, create it; recursive = TRUE also
+    # creates the year directory above it when that is missing
     if (!dir.exists(financial_year_dir)) {
-      dir.create(financial_year_dir)
+      dir.create(financial_year_dir, recursive = TRUE)
     }
 
+    # destination: same file name with an "anon-" prefix
+    new_file_path <- file.path(financial_year_dir, paste0("anon-", basename(csv_file)))
+
+    # files with a UPI column have it renamed to chi and replaced by anon_chi;
+    # all other files are copied across unchanged
+    # NOTE(review): is_chi_in_file() matches "UPI" case-sensitively, while
+    # tidyselect::contains() ignores case by default - confirm they agree
+    chi_in_file <- is_chi_in_file(csv_file)
+    if (chi_in_file) {
+      read_file(csv_file) %>%
+        dplyr::rename_with(~ paste0("chi"), tidyselect::contains("UPI")) %>%
+        slfhelper::get_anon_chi() %>%
+        readr::write_csv(file = new_file_path)
+      cat("Replaced chi with anon chi:", csv_file, "to", new_file_path, "\n")
+    } else {
+      fs::file_copy(csv_file, new_file_path, overwrite = TRUE)
+      cat("Moved", csv_file, "to", new_file_path, "\n")
+    }
+
     # compress file
     if (compress_files) {
-      cat("Compressing:", basename(csv_file), "\n")
+      cat("Compressing:", basename(new_file_path), "\n")
+      # -f overwrites a .gz left by an earlier run instead of skipping the file
       system2(
         command = "gzip",
-        args = shQuote(csv_file)
+        args = c("-f", shQuote(new_file_path))
       )
-      csv_file <- paste0(csv_file, ".gz")
     }
-
-    # move file
-    new_file_path <- file.path(financial_year_dir, basename(csv_file))
-    fs::file_copy(csv_file, new_file_path, overwrite = TRUE)
+    # remove old files
     file.remove(csv_file)
-    cat("Moved:", csv_file, "to", new_file_path, "\n")
   }
 }
+
+# invisible() stops the per-file return values being printed at top level
+invisible(
+  lapply(csv_files, move_temps_to_year_extract, compress_files = compress_files)
+)