Skip to content

Commit

Permalink
update 00_Sort_BI_Extracts
Browse files Browse the repository at this point in the history
replace for loop by function to enable parallel computing with lapply
  • Loading branch information
lizihao-anu committed May 20, 2024
1 parent 2824542 commit d70c33d
Showing 1 changed file with 42 additions and 26 deletions.
68 changes: 42 additions & 26 deletions 00_Sort_BI_Extracts.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Define the source directory and financial year pattern
# NOTE(review): `compress_files` is assigned twice in a row below (this looks
# like the before/after lines of a diff). If both lines run, the later value
# (TRUE) is the one in effect. The flag is passed on to
# move_temps_to_year_extract(), where it controls whether gzip is run.
compress_files <- FALSE
compress_files <- TRUE
# Directory of temporary extracts.
# NOTE(review): presumably this is where `csv_files` is listed from - confirm.
source_dir <- "/conf/sourcedev/Source_Linkage_File_Updates/Extracts Temp"

Check failure on line 3 in 00_Sort_BI_Extracts.R

View workflow job for this annotation

GitHub Actions / Check Spelling

`sourcedev` is not a recognized word. (unrecognized-spelling)
# Regular expression for extract file names: a literal "-20", then four digits
# (captured as a group), then a literal ".csv".
# NOTE(review): presumably the captured digits encode the financial year;
# confirm against extract_financial_year().
pattern <- "-20(\\d{4})\\.csv"

Expand All @@ -20,43 +20,59 @@ extract_financial_year <- function(filename) {
}
}

# Create directories for each financial year and move files
for (csv_file in csv_files) {
# Check whether a CSV file has a column whose name contains "UPI".
#
# Only the header and at most one data row are read, so the check stays cheap
# even for very large extracts.
#
# Args:
#   filename: path to a CSV file readable by read.csv().
#
# Returns:
#   TRUE if at least one column name contains the text "UPI", FALSE otherwise.
is_chi_in_file <- function(filename) {
  # `nrows` is spelled out in full rather than relying on partial argument
  # matching of `nrow`.
  first_row <- read.csv(filename, nrows = 1)
  # "UPI" is a literal substring, so match it as fixed text.
  any(grepl("UPI", names(first_row), fixed = TRUE))
}

# Move one temporary extract into its financial-year "Extracts" folder.
#
# The financial year is taken from the file name via extract_financial_year().
# If no year is found (NULL), the file is left where it is. Otherwise:
#   * files with a "UPI" column are read, the column is renamed to `chi`,
#     passed through slfhelper::get_anon_chi(), and written to the year folder
#     as "anon-<original name>";
#   * all other files are copied to the year folder under their own name.
# The written file is then optionally gzipped and the source file is removed.
#
# Args:
#   csv_file: path to a single extract file.
#   compress_files: if TRUE, run `gzip` on the file written to the year folder.
#
# Returns:
#   NULL, invisibly; the function is called for its side effects.
move_temps_to_year_extract <- function(csv_file, compress_files = TRUE) {
  financial_year <- extract_financial_year(csv_file)

  # Nothing to do when the file name carries no recognisable financial year.
  if (is.null(financial_year)) {
    return(invisible(NULL))
  }

  financial_year_dir <- file.path(
    "/conf/sourcedev/Source_Linkage_File_Updates",
    financial_year,
    "Extracts"
  )

  # If financial_year_dir does not exist, create it. `recursive = TRUE` also
  # creates the year folder itself when it is missing.
  if (!dir.exists(financial_year_dir)) {
    dir.create(financial_year_dir, recursive = TRUE)
  }

  chi_in_file <- is_chi_in_file(csv_file)
  if (chi_in_file) {
    # Replace chi with anon_chi and write the result under an "anon-" name.
    new_file_path <- file.path(
      financial_year_dir,
      paste0("anon-", basename(csv_file))
    )
    read_file(csv_file) %>%
      dplyr::rename_with(~ paste0("chi"), tidyselect::contains("UPI")) %>%
      slfhelper::get_anon_chi() %>%
      readr::write_csv(file = new_file_path)
    cat("Replaced chi with anon chi:", csv_file, "to", new_file_path, "\n")
  } else {
    # No CHI column: copy the file across unchanged.
    new_file_path <- file.path(financial_year_dir, basename(csv_file))
    fs::file_copy(csv_file, new_file_path, overwrite = TRUE)
    cat("Moved", csv_file, "to", new_file_path, "\n")
  }

  # Compress the file now sitting in the year folder.
  if (compress_files) {
    cat("Compressing:", basename(new_file_path), "\n")
    gzip_status <- system2(
      command = "gzip",
      args = shQuote(new_file_path)
    )
    # Surface a failed gzip instead of ignoring it; the uncompressed copy is
    # still in place in that case.
    if (!isTRUE(gzip_status == 0)) {
      warning(
        "gzip failed for ", new_file_path, " (status ", gzip_status, ")",
        call. = FALSE
      )
    }
  }

  # The destination copy has been written, so the temporary source can go.
  file.remove(csv_file)

  invisible(NULL)
}

# Run the move step on every file in `csv_files`, forwarding the script-level
# `compress_files` setting to each call. lapply() processes the files one
# after another.
lapply(
  X = csv_files,
  FUN = move_temps_to_year_extract,
  compress_files = compress_files
)

0 comments on commit d70c33d

Please sign in to comment.