diff --git a/.github/workflows/delete-model-runs.yaml b/.github/workflows/delete-model-runs.yaml
new file mode 100644
index 00000000..687b25a6
--- /dev/null
+++ b/.github/workflows/delete-model-runs.yaml
@@ -0,0 +1,59 @@
+# Workflow that can be manually dispatched to delete test model runs that
+# do not need to be persisted indefinitely.
+#
+# Gated such that it's impossible to delete runs older than the current
+# assessment cycle, where each assessment cycle starts in May.
+
+name: delete-model-runs
+
+on:
+  workflow_dispatch:
+    inputs:
+      run-ids:
+        description: >
+          Run IDs: Space-delimited list of IDs of model runs to delete. Note
+          that the workflow assumes these IDs correspond to model runs for the
+          current assessment cycle, and if that's not the case the deletion
+          script will raise an error.
+        required: true
+        type: string
+        default: 2024-01-01-foo-bar 2024-01-02-bar-baz
+
+jobs:
+  delete-model-runs:
+    runs-on: ubuntu-latest
+    permissions:
+      # Needed to interact with GitHub's OIDC Token endpoint so we can auth AWS
+      contents: read
+      id-token: write
+    steps:
+      - name: Checkout repo code
+        uses: actions/checkout@v4
+
+      - name: Setup R
+        uses: r-lib/actions/setup-r@v2
+        with:
+          use-public-rspm: true
+
+      - name: Install system dependencies
+        run: sudo apt-get update && sudo apt-get install -y libgit2-dev
+        shell: bash
+
+      - name: Disable renv sandbox to speed up install time
+        run: echo "RENV_CONFIG_SANDBOX_ENABLED=FALSE" >> .Renviron
+        shell: bash
+
+      - name: Setup renv
+        uses: r-lib/actions/setup-renv@v2
+
+      - name: Configure AWS credentials
+        uses: aws-actions/configure-aws-credentials@v4
+        with:
+          role-to-assume: ${{ secrets.AWS_IAM_ROLE_MODEL_DELETION_ARN }}
+          aws-region: us-east-1
+
+      - name: Delete model runs
+        run: Rscript ./R/delete_current_year_model_runs.R "${RUN_IDS// /,}"
+        shell: bash
+        env:
+          RUN_IDS: ${{ inputs.run-ids }}
diff --git a/R/delete_current_year_model_runs.R b/R/delete_current_year_model_runs.R
new file mode 100644
index 00000000..a1a95019
--- /dev/null
+++ b/R/delete_current_year_model_runs.R
@@ -0,0 +1,109 @@
+# Script to delete a list of model runs by ID from AWS.
+#
+# Accepts one argument, a comma-delimited list of run IDs for model runs
+# whose artifacts should be deleted.
+#
+# Assumes that model runs are restricted to the current assessment cycle, where
+# each assessment cycle starts in May. Raises an error if no run IDs are
+# supplied, or if no objects matching a given ID for the current year could be
+# located in S3. These errors are raised before any deletion occurs, so if one
+# or more IDs are invalid then no objects will be deleted.
+#
+# Example usage (delete the runs 123, 456, and 789 in the current year):
+#
+#   delete_current_year_model_runs.R 123,456,789
+
+library(glue)
+library(here)
+library(magrittr)
+source(here("R", "helpers.R"))
+
+# Read the month and year directly from the Date rather than round-tripping
+# through POSIXct, so the result never depends on the session time zone
+current_date <- Sys.Date()
+current_month <- as.integer(format(current_date, "%m"))
+current_year <- as.integer(format(current_date, "%Y"))
+
+# The following heuristic determines the current upcoming assessment cycle year:
+#
+#   * From May to December (post assessment), `year` = next year
+#   * From January to April (during assessment), `year` = current year
+#
+# `year` is kept as a character string; it is passed to the helpers sourced
+# from R/helpers.R and interpolated into the log messages below
+year <- if (current_month < 5L) {
+  as.character(current_year)
+} else {
+  as.character(current_year + 1L)
+}
+
+# Convert the comma-delimited input to a vector of run IDs. Accepting one or
+# more positional arguments would be a cleaner UX, but since this script is
+# intended to be called from a dispatched GitHub workflow, it's easier to parse
+# one comma-delimited string than convert a space-separated string passed as a
+# workflow input to an array of function arguments
+raw_run_ids <- commandArgs(trailingOnly = TRUE)
+run_ids <- raw_run_ids %>%
+  strsplit(split = ",", fixed = TRUE) %>%
+  unlist() %>%
+  as.character() %>%
+  trimws()
+
+# Drop blank entries (produced by repeated, leading, or trailing delimiters)
+# and duplicates so that every remaining ID is checked and deleted exactly once
+run_ids <- unique(run_ids[nzchar(run_ids)])
+
+# Fail fast on empty input; otherwise the validity check below would pass
+# vacuously and the script would report success without doing anything
+if (length(run_ids) == 0) {
+  stop("No run IDs supplied; expected a comma-delimited list of run IDs")
+}
+
+# Collapse the IDs so that each log message is a single line; interpolating
+# the vector directly would make glue emit one message per run ID
+run_id_list <- paste(run_ids, collapse = ", ")
+
+"Confirming artifacts exist for run IDs in year {year}: {run_id_list}" %>%
+  glue::glue() %>%
+  print()
+
+# We consider a run ID to be valid if it has any matching data in S3 for
+# the current year
+run_id_is_valid <- function(run_id, year) {
+  model_get_s3_artifacts_for_run(run_id, year) %>%
+    vapply(
+      function(s3_obj) isTRUE(aws.s3::object_exists(s3_obj)),
+      logical(1)
+    ) %>%
+    any()
+}
+
+# We check for validity separate from the deletion operation for two reasons:
+#
+#   1. The aws.s3::delete_object API does not raise an error if an object does
+#      not exist, so a delete operation alone won't alert us for an incorrect
+#      ID
+#   2. Even if aws.s3::delete_object could raise an error for missing objects,
+#      we want to alert the caller that one or more of the IDs were incorrect
+#      before deleting any objects so that this script is nondestructive
+#      in the case of a malformed ID
+valid_run_ids <- vapply(run_ids, run_id_is_valid, logical(1), year = year)
+
+if (!all(valid_run_ids)) {
+  invalid_run_ids <- run_ids[!valid_run_ids] %>%
+    paste(collapse = ", ")
+
+  "Some run IDs are missing all S3 artifacts for {year}: {invalid_run_ids}" %>%
+    glue::glue() %>%
+    stop()
+}
+
+"Deleting S3 artifacts run IDs in year {year}: {run_id_list}" %>%
+  glue::glue() %>%
+  print()
+
+# Plain loop: deletion is wanted purely for its side effect, so there is no
+# result to collect or print
+for (run_id in run_ids) {
+  model_delete_run(run_id, year = year)
+}
diff --git a/R/helpers.R b/R/helpers.R
index 0fbddd5f..6bfd069b 100644
--- a/R/helpers.R
+++ b/R/helpers.R
@@ -28,18 +28,16 @@ model_file_dict <- function(run_id = NULL, year = NULL) {
   return(dict)
 }
 
-
-# Used to delete erroneous, incomplete, or otherwise unwanted runs
-# Use with caution! Deleted models are retained for a period of time before
-# being permanently deleted
-model_delete_run <- function(run_id, year) {
+# Get a vector of S3 paths to the artifacts for a given model run
+model_get_s3_artifacts_for_run <- function(run_id, year) {
   # Get paths of all run objects based on the file dictionary
   paths <- model_file_dict(run_id, year)
   s3_objs <- grep("s3://", unlist(paths), value = TRUE)
   bucket <- strsplit(s3_objs[1], "/")[[1]][3]
 
   # First get anything partitioned only by year
-  s3_objs_limited <- grep(".parquet$|.zip$|.rds$", s3_objs, value = TRUE)
+  s3_objs_limited <- grep(".parquet$|.zip$|.rds$", s3_objs, value = TRUE) %>%
+    unname()
 
   # Next get the prefix of anything partitioned by year and run_id
   s3_objs_dir_path <- file.path(
@@ -53,16 +51,21 @@ model_delete_run <- function(run_id, year) {
   )
   s3_objs_dir_path <- gsub(paste0("s3://", bucket, "/"), "", s3_objs_dir_path)
   s3_objs_dir_path <- gsub("//", "/", s3_objs_dir_path)
-  s3_objs_w_run_id <- unlist(purrr::map(
-    s3_objs_dir_path,
-    ~ aws.s3::get_bucket_df(bucket, .x)$Key
-  ))
-
-  # Delete current version of objects
-  purrr::walk(s3_objs_limited, aws.s3::delete_object)
-  purrr::walk(s3_objs_w_run_id, aws.s3::delete_object, bucket = bucket)
+  s3_objs_w_run_id <- s3_objs_dir_path %>%
+    purrr::map(~ aws.s3::get_bucket_df(bucket, .x)$Key) %>%
+    unlist() %>%
+    purrr::map_chr(~ glue::glue("s3://{bucket}/{.x}"))
+
+  return(c(s3_objs_limited, s3_objs_w_run_id))
 }
 
+# Used to delete erroneous, incomplete, or otherwise unwanted runs
+# Use with caution! Deleted models are retained for a period of time before
+# being permanently deleted
+model_delete_run <- function(run_id, year) {
+  model_get_s3_artifacts_for_run(run_id, year) %>%
+    purrr::walk(aws.s3::delete_object)
+}
 
 # Used to fetch a run's output from S3 and populate it locally. Useful for
 # running reports and performing local troubleshooting