From f6854d6ad5690294a023f17b53f89a1fe71dbcd3 Mon Sep 17 00:00:00 2001 From: James McMahon Date: Tue, 18 Jul 2023 13:45:02 +0100 Subject: [PATCH 1/2] Revert "Write out as a partitioned arrow dataset (#726)" This reverts commit 654dc00232a5ff87c1e97d614ad360cf42e03fcd. --- R/run_episode_file.R | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/R/run_episode_file.R b/R/run_episode_file.R index 16b7ee3c2..1f2bb33ed 100644 --- a/R/run_episode_file.R +++ b/R/run_episode_file.R @@ -118,8 +118,7 @@ run_episode_file <- function( } if (write_to_disk) { - # TODO make the slf_path a function - slf_episode_path <- get_file_path( + slf_path <- get_file_path( get_year_dir(year), stringr::str_glue( "source-episode-file-{year}.parquet" @@ -127,17 +126,7 @@ run_episode_file <- function( check_mode = "write" ) - write_file(episode_file, slf_episode_path) - - arrow::write_dataset( - dataset = episode_file, - path = fs::path_ext_remove(slf_episode_path), - format = "parquet", - # Should correspond to the available slfhelper filters - partitioning = c("recid", "hscp2018"), - compression = "zstd", - version = "latest" - ) + write_file(episode_file, slf_path) } return(episode_file) From 54c16369dc4acf9867441a543af170036e19029f Mon Sep 17 00:00:00 2001 From: James McMahon Date: Tue, 18 Jul 2023 14:06:59 +0100 Subject: [PATCH 2/2] Write the episode file out as a partitioned dataset This is its own target so it won't hold up the rest of the processing. --- R/run_episode_file.R | 5 +++-- _targets.R | 12 ++++++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/R/run_episode_file.R b/R/run_episode_file.R index 1f2bb33ed..668a40124 100644 --- a/R/run_episode_file.R +++ b/R/run_episode_file.R @@ -118,7 +118,8 @@ run_episode_file <- function( } if (write_to_disk) { - slf_path <- get_file_path( + # TODO make the slf_path a function + slf_episode_path <- get_file_path( get_year_dir(year), stringr::str_glue( "source-episode-file-{year}.parquet" @@ -126,7 +127,7 @@ run_episode_file <- function( check_mode = "write" ) - write_file(episode_file, slf_path) + write_file(episode_file, slf_episode_path) } return(episode_file) diff --git a/_targets.R b/_targets.R index 869d0d194..58e5f573f 100644 --- a/_targets.R +++ b/_targets.R @@ -547,6 +547,18 @@ list( data = episode_file, year = year ) + ), + tar_target( + episode_file_dataset, + arrow::write_dataset( + dataset = episode_file, + path = fs::path_ext_remove(slf_episode_path), + format = "parquet", + # Should correspond to the available slfhelper filters + partitioning = c("recid", "hscp2018"), + compression = "zstd", + version = "latest" + ) ) ) )