diff --git a/FIXME.Rproj b/FIXME.Rproj
deleted file mode 100644
index 718d2734..00000000
--- a/FIXME.Rproj
+++ /dev/null
@@ -1,19 +0,0 @@
-Version: 1.0
-
-RestoreWorkspace: No
-SaveWorkspace: No
-AlwaysSaveHistory: Default
-
-EnableCodeIndexing: Yes
-UseSpacesForTab: Yes
-NumSpacesForTab: 2
-Encoding: UTF-8
-
-RnwWeave: Sweave
-LaTeX: pdfLaTeX
-
-AutoAppendNewline: Yes
-StripTrailingWhitespace: Yes
-LineEndingConversion: Posix
-
-BuildType: Website
diff --git a/config.yaml b/config.yaml
index 63382b36..5aef8d66 100644
--- a/config.yaml
+++ b/config.yaml
@@ -69,6 +69,7 @@ episodes:
 - branch.Rmd
 - parallel.Rmd
 - quarto.Rmd
+- hpc.Rmd
 
 # Information for Learners
 learners:
diff --git a/episodes/files/plans/plan_slurm.R b/episodes/files/plans/plan_slurm.R
new file mode 100644
index 00000000..4d1c710c
--- /dev/null
+++ b/episodes/files/plans/plan_slurm.R
@@ -0,0 +1,49 @@
+library(crew.cluster)
+library(targets)
+library(tarchetypes)
+library(palmerpenguins)
+library(broom)
+suppressPackageStartupMessages(library(tidyverse))
+
+source("R/packages.R")
+source("R/functions.R")
+
+tar_option_set(
+  controller = crew_controller_slurm(
+    workers = 3,
+    script_lines = "module load R",
+    slurm_memory_gigabytes_per_cpu = 1
+  )
+)
+
+tar_plan(
+  # Load raw data
+  tar_file_read(
+    penguins_data_raw,
+    path_to_file("penguins_raw.csv"),
+    read_csv(!!.x, show_col_types = FALSE)
+  ),
+  # Clean data
+  penguins_data = clean_penguin_data(penguins_data_raw),
+  # Build models
+  models = list(
+    combined_model = lm(
+      bill_depth_mm ~ bill_length_mm, data = penguins_data),
+    species_model = lm(
+      bill_depth_mm ~ bill_length_mm + species, data = penguins_data),
+    interaction_model = lm(
+      bill_depth_mm ~ bill_length_mm * species, data = penguins_data)
+  ),
+  # Get model summaries
+  tar_target(
+    model_summaries,
+    glance_with_mod_name_slow(models),
+    pattern = map(models)
+  ),
+  # Get model predictions
+  tar_target(
+    model_predictions,
+    augment_with_mod_name_slow(models),
+    pattern = map(models)
+  )
+)
diff --git a/episodes/files/plans/plan_slurm_gpu.R b/episodes/files/plans/plan_slurm_gpu.R
new file mode 100644
index 00000000..badbab1d
--- /dev/null
+++ b/episodes/files/plans/plan_slurm_gpu.R
@@ -0,0 +1,52 @@
+graphics_devices <- function() {
+  system2("lshw", c("-class", "display"), stdout = TRUE, stderr = FALSE)
+}
+
+library(crew)
+library(targets)
+library(tarchetypes)
+library(crew.cluster)
+
+source("R/packages.R")
+source("R/functions.R")
+
+tar_option_set(
+  controller = crew_controller_group(
+    crew_controller_slurm(
+      name = "cpu_worker",
+      workers = 1,
+      script_lines = "module load R",
+      slurm_memory_gigabytes_per_cpu = 1,
+      slurm_cpus_per_task = 1
+    ),
+
+    crew_controller_slurm(
+      name = "gpu_worker",
+      workers = 1,
+      script_lines = c(
+        "#SBATCH --partition=gpuq",
+        "#SBATCH --gres=gpu:1",
+        "module load R"
+      ),
+      slurm_memory_gigabytes_per_cpu = 1,
+      slurm_cpus_per_task = 1
+    )
+  )
+)
+
+tar_plan(
+  tar_target(
+    cpu_hardware,
+    graphics_devices(),
+    resources = tar_resources(
+      crew = tar_resources_crew(controller = "cpu_worker")
+    )
+  ),
+  tar_target(
+    gpu_hardware,
+    graphics_devices(),
+    resources = tar_resources(
+      crew = tar_resources_crew(controller = "gpu_worker")
+    )
+  )
+)
diff --git a/episodes/files/plans/plan_slurm_memory.R b/episodes/files/plans/plan_slurm_memory.R
new file mode 100644
index 00000000..80f2ae13
--- /dev/null
+++ b/episodes/files/plans/plan_slurm_memory.R
@@ -0,0 +1,39 @@
+library(crew)
+library(targets)
+library(tarchetypes)
+library(crew.cluster)
+
+source("R/packages.R")
+source("R/functions.R")
+
+small_memory <- crew_controller_slurm(
+  name = "small_memory",
+  script_lines = "module load R",
+  slurm_memory_gigabytes_per_cpu = 1
+)
+big_memory <- crew_controller_slurm(
+  name = "big_memory",
+  script_lines = "module load R",
+  slurm_memory_gigabytes_per_cpu = 2
+)
+
+tar_option_set(
+  controller = crew_controller_group(small_memory, big_memory)
+)
+
+list(
+  tar_target(
+    name = big_memory_task,
+    command = Sys.getenv("SLURM_MEM_PER_CPU"),
+    resources = tar_resources(
+      crew = tar_resources_crew(controller = "big_memory")
+    )
+  ),
+  tar_target(
+    name = small_memory_task,
+    command = Sys.getenv("SLURM_MEM_PER_CPU"),
+    resources = tar_resources(
+      crew = tar_resources_crew(controller = "small_memory")
+    )
+  )
+)
diff --git a/episodes/hpc.Rmd b/episodes/hpc.Rmd
new file mode 100644
index 00000000..eee9b0ed
--- /dev/null
+++ b/episodes/hpc.Rmd
@@ -0,0 +1,324 @@
---
title: 'Deploying Targets on HPC'
teaching: 10
exercises: 2
---

```{r}
#| echo: false
# Exit sensibly when Slurm isn't installed
if (!nzchar(Sys.which("sbatch"))) {
  knitr::knit_exit("sbatch was not detected, so Slurm is likely not installed. Exiting.")
}
```

::: questions

- Why would we use HPC to run Targets workflows?
- How can we run Targets workflows on Slurm?

:::

::: objectives

- Be able to run a Targets workflow on Slurm
- Understand how workers relate to targets
- Know how to configure Slurm jobs within targets
- Be able to create a pipeline with heterogeneous workers

:::

```{r}
#| label: setup
#| echo: false
#| message: false
#| warning: false
library(targets)
library(tarchetypes)
library(quarto) # don't actually need to load, but put here so renv catches it
source("files/lesson_functions.R") # nolint

# Increase width for printing tibbles
options(width = 140)
```

## Advantages of HPC

If your analysis involves computationally intensive or long-running tasks, such as training machine learning models or processing very large amounts of data, running it on a single machine quickly becomes infeasible.
If you have access to a High Performance Computing (HPC) cluster, you can use Targets to spread the work across the cluster's many machines and scale up your analysis.
This differs from the parallel execution we have seen so far, which speeds things up by spawning extra R processes on the *same machine*.

## Configuring Targets for Slurm

::: {.prereq}

### Install required packages

You will need to install `crew.cluster` to enable the HPC integration:

```{r}
#| label: 'install-crew.cluster'
#| eval: false
install.packages("crew.cluster")
```
:::

To adapt Targets to use the Slurm HPC scheduler, we change the `controller`.
In this section we will assume that our HPC uses Slurm as its job scheduler, but `crew.cluster` also supports other schedulers such as PBS/TORQUE, Sun Grid Engine (SGE) and LSF.

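If you are not sure which scheduler your cluster runs, a quick heuristic (not part of this lesson's pipeline) is to look for each scheduler's job submission command from an R session on the login node:

```R
# Whichever submission command is on the PATH hints at the scheduler.
# Sys.which() returns "" for commands that are not found.
c(
  slurm   = Sys.which("sbatch"), # Slurm
  pbs_sge = Sys.which("qsub"),   # PBS/TORQUE and SGE both provide qsub
  lsf     = Sys.which("bsub")    # LSF
)
```
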
In the Parallel Processing section, we used the following configuration:

```{r}
#| label: one-machine-crew
#| eval: false
tar_option_set(
  controller = crew_controller_local(workers = 2)
)
```

To configure this for Slurm, we swap out the controller for [`crew_controller_slurm()`](https://wlandau.github.io/crew.cluster/reference/crew_controller_slurm.html) from the [`crew.cluster`](https://wlandau.github.io/crew.cluster/index.html) package:

```{r}
#| label = "slurm-crew",
#| eval = FALSE,
#| code = readLines("files/plans/plan_slurm.R")[11:17]
```

::: callout
If you are using a scheduler other than Slurm, you would select [`crew_controller_lsf()`](https://wlandau.github.io/crew.cluster/reference/crew_controller_lsf.html), [`crew_controller_pbs()`](https://wlandau.github.io/crew.cluster/reference/crew_controller_pbs.html) or [`crew_controller_sge()`](https://wlandau.github.io/crew.cluster/reference/crew_controller_sge.html) instead of `crew_controller_slurm()`.

Each of these functions has its own scheduler-specific arguments.
:::

There are a number of arguments you can pass to `crew_controller_slurm()` to fine-tune the Slurm execution, [which you can find here](https://wlandau.github.io/crew.cluster/reference/crew_controller_slurm.html).
Here we are only using three:

 * `workers` sets the maximum number of jobs that are submitted to Slurm to process targets.
 * `script_lines` adds lines to the Slurm submission script that Targets generates. This is useful for loading Environment Modules, as we do here.
 * `slurm_memory_gigabytes_per_cpu` specifies the amount of memory we need, in gigabytes per CPU.

Let's run the modified workflow:

```{r}
#| label = "slurm-workflow-show",
#| eval = FALSE,
#| code = readLines("files/plans/plan_slurm.R")
```
```{r}
#| label: slurm-workflow-hide
#| echo: false
tar_dir({
  write_example_plan("plan_slurm.R")
  tar_make()
})
```

We've successfully transferred our analysis onto a Slurm cluster!

::: challenge
## Increasing Resources

Q: How would you modify your `_targets.R` if your functions needed 2 CPUs?

::: hint
Check the arguments for [`crew_controller_slurm`](https://wlandau.github.io/crew.cluster/reference/crew_controller_slurm.html#arguments-1).
:::
::: solution
```R
tar_option_set(
  controller = crew_controller_slurm(
    workers = 3,
    script_lines = "module load R",
    slurm_memory_gigabytes_per_cpu = 1,
    # Added this
    slurm_cpus_per_task = 2
  )
)
```
:::
:::

## SBATCH Options

The `script_lines` argument shown above can also be used to add `#SBATCH` directives that configure your worker job.
Each entry in the vector is added as a separate line to the generated `sbatch` script.
However, be careful to put all of your `#SBATCH` lines before any other bash commands, because Slurm stops parsing directives at the first executable line.
`sbatch` flags [are listed here](https://slurm.schedmd.com/sbatch.html#SECTION_OPTIONS) in the Slurm documentation.
For instance, to request that the worker has a GPU available, you could do the following:

```R
tar_option_set(
  controller = crew_controller_slurm(
    workers = 3,
    script_lines = c(
      "#SBATCH --gres=gpu:1",
      "module load R"
    ),
    slurm_memory_gigabytes_per_cpu = 1
  )
)
```

In general, it's better to use a dedicated `crew_controller_slurm()` argument than `script_lines`, if one exists.
For example, prefer `slurm_cpus_per_task = 2` to `script_lines = "#SBATCH --cpus-per-task=2"`, and set `name = "my_name"` rather than using `script_lines = "#SBATCH --job-name=my_name"`.

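As a sketch of the difference, here is the same CPU request written both ways (assuming, as above, that our cluster needs `module load R`):

```R
# Works, but opaque: the CPU request is buried in a raw #SBATCH line.
crew_controller_slurm(
  script_lines = c(
    "#SBATCH --cpus-per-task=2",
    "module load R"
  )
)

# Preferred: the dedicated argument keeps the request visible in R code.
crew_controller_slurm(
  slurm_cpus_per_task = 2,
  script_lines = "module load R"
)
```
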
## HPC Workers

`crew` uses a persistent worker strategy.
This means that `crew` does not submit one Slurm job for each target.
Instead, you define a pool of workers when configuring the workflow.
In our example above we specified a maximum of 3 workers.
For each worker, `crew` submits a single Slurm job, and these workers will process multiple targets over their lifetime.

We can verify that this has happened using `sacct`, which queries information about our past jobs.
All the Slurm jobs with the same hash (the part after `crew-`) belong to the same `crew` controller:

```{bash}
sacct --starttime now-5minutes --allocations
```

The upside of this approach is that we don't have to know in advance how long each target takes to build, or what resources it needs.
It also means that we submit only a handful of jobs, making our Slurm usage easy to monitor.

The downside of this mechanism is that **the resources of the worker have to be sufficient to build every one of your targets**.
In other words, you need to work out the maximum RAM and CPUs used across all of your targets, and request those maximum resources in the `crew_controller_slurm()` call.

::: challenge
## Choosing a Worker

Q: Say we have two targets. One uses 100 GB of RAM and 1 CPU, and the other needs 10 GB of RAM and 8 CPUs to run a multi-threaded function. What worker configuration do we use?

::: solution
Each worker needs the maximum of all resource requirements: 100 GB of RAM and 8 CPUs.
To do this we might use a controller a bit like this:
```{r, results="hide"}
crew_controller_slurm(
  name = "cpu_worker",
  workers = 3,
  script_lines = "module load R",
  slurm_cpus_per_task = 8,
  slurm_memory_gigabytes_per_cpu = 100 / 8 # 12.5 GB per CPU x 8 CPUs = 100 GB
)
```
:::
:::

## Heterogeneous Workers

In some cases we may prefer more than one kind of Slurm job processing our targets, especially if some of our targets need different hardware from others, such as a GPU.
When we do this, we say we have "heterogeneous workers", meaning that not all worker jobs are the same as each other.
To do this, we first define each worker configuration, distinguishing them with the `name` argument to `crew_controller_slurm()`:

```{r}
#| eval = FALSE,
#| code = readLines("files/plans/plan_slurm_memory.R")[8:19]
```

Next, we tell Targets about these controllers using `tar_option_set()` as before, with one difference: we have to combine them in a controller group:

```{r}
#| eval = FALSE,
#| code = readLines("files/plans/plan_slurm_memory.R")[20:23]
```

Then we specify each controller by name in each target definition:

```{r}
#| eval = FALSE,
#| code = readLines("files/plans/plan_slurm_memory.R")[24:39]
```

When we run the pipeline, we can see the differing results:
```{r}
#| label: het-example-show
#| eval: false
tar_make()
tar_read(big_memory_task)
tar_read(small_memory_task)
```
```{r}
#| label: het-example-hide
#| eval: true
#| echo: false
tar_dir({
  write_example_plan("plan_slurm_memory.R")
  tar_make()
  tar_read(big_memory_task) |> print()
  tar_read(small_memory_task) |> print()
})
```

::: challenge
## Mixing GPU and CPU targets

Q: Say we have the following targets workflow. How would we modify it so that `gpu_hardware` is only built in a Slurm job with a GPU?

```{r}
#| eval: false
graphics_devices <- function() {
  system2("lshw", c("-class", "display"), stdout = TRUE, stderr = FALSE)
}

tar_plan(
  tar_target(
    cpu_hardware,
    graphics_devices()
  ),
  tar_target(
    gpu_hardware,
    graphics_devices()
  )
)
```

::: hint
You will need to define two different crew controllers.
Also, [you will need to request a GPU from Slurm](https://slurm.schedmd.com/gres.html#Running_Jobs).
You can find an example of this above.
:::
::: solution
We define a controller group containing a CPU controller and a GPU controller, then point each target at the right controller via `resources`:
```{r}
#| label = 'heterogeneous-controllers-show',
#| eval = FALSE,
#| code = readLines("files/plans/plan_slurm_gpu.R")[13:52]
```
```{r}
#| label: heterogeneous-controllers-hide
#| eval: true
#| echo: false
tar_dir({
  write_example_plan("plan_slurm_gpu.R")
  tar_make()
  tar_load_everything()
})
```
```{r}
#| label: heterogeneous-controllers-cpu-show
#| eval: false
tar_read("cpu_hardware")
```
```{r}
#| label: heterogeneous-controllers-cpu-hide
#| eval: true
#| echo: false
cpu_hardware
```
```{r}
#| label: heterogeneous-controllers-gpu-show
#| eval: false
tar_read("gpu_hardware")
```
```{r}
#| label: heterogeneous-controllers-gpu-hide
#| eval: true
#| echo: false
gpu_hardware
```
:::
:::

::::::::::::::::::::::::::::::::::::: keypoints

- Use `crew.cluster::crew_controller_slurm()` to configure a workflow for Slurm
- `crew` uses persistent workers on HPC, so each worker's resources must cover the most demanding of your targets
- You can create heterogeneous workers by defining multiple named `crew_controller_slurm()` controllers and combining them in a controller group

::::::::::::::::::::::::::::::::::::::::::::::::