diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index d7563e15..c2022b66 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -51,6 +51,7 @@ jobs: teal riskmetric tidyCDISC + mirai - name: Install tinytex run: quarto install tool tinytex diff --git a/.github/workflows/update_post_dates.yml b/.github/workflows/update_post_dates.yml index 52addc09..d34359ef 100644 --- a/.github/workflows/update_post_dates.yml +++ b/.github/workflows/update_post_dates.yml @@ -14,7 +14,8 @@ jobs: - name: Checkout repository uses: actions/checkout@v4 with: - ref: main # replace with the branch you want to checkout + ref: main + token: ${{ secrets.PHARMAVERSE_BOT }} - name: Run update_post_dates run: Rscript R/update_post_dates.R # running the R script with Rscript @@ -22,25 +23,12 @@ jobs: - name: Configure Git safe directory run: git config --global --add safe.directory /__w/blog/blog - - name: Check for changes - id: check_changes - shell: bash - run: | - git config --global user.name 'github-actions[bot]' - git config --global user.email 'github-actions[bot]@users.noreply.github.com' - if [[ `git status --porcelain` ]]; then - echo "changes detected" - echo "changes=true" >> $GITHUB_ENV - else - echo "no changes" - echo "changes=false" >> $GITHUB_ENV - fi - - - name: Commit results - if: env.changes == 'true' - run: | - git add . - git commit -m "Auto-update blog post date" - git push origin main - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Commit and push changes + uses: stefanzweifel/git-auto-commit-action@v5 + with: + commit_message: "[skip actions] Auto-update blog post date" + file_pattern: "." 
+ commit_user_name: github-actions + commit_user_email: >- + 41898282+github-actions[bot]@users.noreply.github.com + continue-on-error: true diff --git a/inst/WORDLIST.txt b/inst/WORDLIST.txt index b675dd58..b95f1281 100644 --- a/inst/WORDLIST.txt +++ b/inst/WORDLIST.txt @@ -1120,6 +1120,33 @@ astrazeneca laura MeetLaura needleman +Alexandros +clusterSetRNGStream +CMRG +doFuture +elbersb +filelock +HPC +Kouretsis +L'Ecuyer +mirai +nanonext +opre +parallelization +Parallelization +parallelizes +parallelMap +pubsonline +reprovision +shikokuchuo +tensio +tidylog +tidyr +wlandau +zzz +axecute +readRDS +saveRDS WEL Parmar RMarkdown diff --git a/posts/2024-10-16_the__tensio.../appendix.R b/posts/2024-10-16_the__tensio.../appendix.R new file mode 100644 index 00000000..c69926b0 --- /dev/null +++ b/posts/2024-10-16_the__tensio.../appendix.R @@ -0,0 +1,73 @@ +suppressMessages(library(dplyr)) +# markdown helpers -------------------------------------------------------- + +markdown_appendix <- function(name, content) { + paste(paste("##", name, "{.appendix}"), " ", content, sep = "\n") +} +markdown_link <- function(text, path) { + paste0("[", text, "](", path, ")") +} + + + +# worker functions -------------------------------------------------------- + +insert_source <- function(repo_spec, name, + collection = "posts", + branch = "main", + host = "https://github.com", + text = "Source", + file_name) { + path <- paste( + host, + repo_spec, + "tree", + branch, + collection, + name, + file_name, + sep = "/" + ) + return(markdown_link(text, path)) +} + +insert_timestamp <- function(tzone = Sys.timezone()) { + time <- lubridate::now(tzone = tzone) + stamp <- format(time, tz = tzone, usetz = TRUE) # format() honours tz/usetz; as.character() ignores them on POSIXct since R 4.3 + return(stamp) +} + +insert_lockfile <- function(repo_spec, name, + collection = "posts", + branch = "main", + host = "https://github.com", + text = "Session info") { + path <- "https://pharmaverse.github.io/blog/session_info.html" # fixed site-wide page; only `text` is used, the other arguments are unused here + + return(markdown_link(text, path)) +} + + 
+ +# top level function ------------------------------------------------------ + +insert_appendix <- function(repo_spec, name, collection = "posts", file_name) { + appendices <- paste( + markdown_appendix( + name = "Last updated", + content = insert_timestamp() + ), + " ", + markdown_appendix( + name = "Details", + content = paste( + insert_source(repo_spec, name, collection, file_name = file_name), + # "Session info" link; insert_lockfile() does not read renv, it only builds a markdown link + insert_lockfile(repo_spec, name, collection), + sep = ", " + ) + ), + sep = "\n" + ) + knitr::asis_output(appendices) # mark the string as raw output so knitr writes the markdown as-is +} diff --git a/posts/2024-10-16_the__tensio.../cache_execution.rds b/posts/2024-10-16_the__tensio.../cache_execution.rds new file mode 100644 index 00000000..880f563c Binary files /dev/null and b/posts/2024-10-16_the__tensio.../cache_execution.rds differ diff --git a/posts/2024-10-16_the__tensio.../mirai_workflow.R b/posts/2024-10-16_the__tensio.../mirai_workflow.R new file mode 100644 index 00000000..ad953a4d --- /dev/null +++ b/posts/2024-10-16_the__tensio.../mirai_workflow.R @@ -0,0 +1,26 @@ +{ + library("mirai") + library("dplyr") + log_file <- tempfile() + mirai::daemons(4) + mirai::everywhere( + { + library("dplyr") + library("tidylog") + log_to_file <- function(txt) { + cat(txt, + file = log_file, + sep = "\n", append = TRUE + ) + } + options(tidylog.display = list(message, log_to_file)) + }, + log_file = log_file + ) + m <- mirai_map(letters[1:5], function(x) { + mutate(tibble(.rows = 1), `:=`("{x}", sample(1:100, 1))) + }) + result <- bind_cols(m[]) + mirai::daemons(0) + print(list(logs = readLines(log_file), result = result)) +} diff --git a/posts/2024-10-16_the__tensio.../mirai_workflow.log b/posts/2024-10-16_the__tensio.../mirai_workflow.log new file mode 100644 index 00000000..1a089751 --- /dev/null +++ b/posts/2024-10-16_the__tensio.../mirai_workflow.log @@ -0,0 +1,171 @@ +-------------------------------------------------------------------------------- +- logrx Metadata - 
+-------------------------------------------------------------------------------- +This log was generated using logrx 0.3.1 +logrx package version: 0.3.1 +logrx build: RSPM (R 4.3.0) +logrx link to repository: https://github.com/pharmaverse/logrx +-------------------------------------------------------------------------------- +- User and File Information - +-------------------------------------------------------------------------------- +User: ale +File Name: mirai_workflow.R +File Path: /home/ale/blog/posts/zzz_DO_NOT_EDIT_the__tensio... +File HashSum: 93e86d1e6a59ab5475d3c9fbd1948be36228a1fe +-------------------------------------------------------------------------------- +- Session Information - +-------------------------------------------------------------------------------- +─ Session info ─────────────────────────────────────────────────────────────── + setting value + version R version 4.3.2 (2023-10-31) + os Ubuntu 22.04.3 LTS + system x86_64, linux-gnu + ui X11 + language (EN) + collate en_US.UTF-8 + ctype en_US.UTF-8 + tz Etc/UTC + date 2024-10-07 + pandoc 3.1.1 @ /usr/lib/rstudio-server/bin/quarto/bin/tools/ (via rmarkdown) + +─ Packages ─────────────────────────────────────────────────────────────────── + package * version date (UTC) lib source + backports 1.4.1 2021-12-13 [1] RSPM (R 4.3.0) + callr 3.7.3 2022-11-02 [1] RSPM (R 4.3.0) + cli 3.6.2 2023-12-11 [1] RSPM (R 4.3.0) + crayon 1.5.2 2022-09-29 [1] RSPM (R 4.3.0) + cyclocomp 1.1.1 2023-08-30 [1] RSPM (R 4.3.0) + data.table 1.15.0 2024-01-30 [1] RSPM (R 4.3.0) + desc 1.4.3 2023-12-10 [1] RSPM (R 4.3.0) + digest 0.6.34 2024-01-11 [1] RSPM (R 4.3.0) + dplyr * 1.1.4 2023-11-17 [1] RSPM (R 4.3.0) + evaluate 0.23 2023-11-01 [1] RSPM (R 4.3.0) + fansi 1.0.6 2023-12-08 [1] RSPM (R 4.3.0) + fastmap 1.2.0 2024-05-15 [1] RSPM (R 4.3.0) + generics 0.1.3 2022-07-05 [1] RSPM (R 4.3.0) + glue 1.7.0 2024-01-09 [1] RSPM (R 4.3.0) + htmltools 0.5.7 2023-11-03 [1] RSPM (R 4.3.0) + htmlwidgets 1.6.4 2023-12-06 [1] 
RSPM (R 4.3.0) + httpuv 1.6.15 2024-03-26 [1] RSPM (R 4.3.0) + jsonlite 1.8.8 2023-12-04 [1] RSPM (R 4.3.0) + knitr 1.45 2023-10-30 [1] RSPM (R 4.3.0) + later 1.3.2 2023-12-06 [1] RSPM (R 4.3.0) + lazyeval 0.2.2 2019-03-15 [1] RSPM (R 4.3.0) + lifecycle 1.0.4 2023-11-07 [1] RSPM (R 4.3.0) + lintr 3.1.2 2024-03-25 [1] RSPM (R 4.3.0) + logrx 0.3.1 2024-04-12 [1] RSPM (R 4.3.0) + magrittr 2.0.3 2022-03-30 [1] RSPM (R 4.3.0) + mime 0.12 2021-09-28 [1] RSPM (R 4.3.0) + miniUI 0.1.1.1 2018-05-18 [1] RSPM (R 4.3.0) + mirai * 1.2.0 2024-08-18 [1] RSPM (R 4.3.0) + nanonext 1.2.1 2024-08-19 [1] RSPM (R 4.3.0) + pillar 1.9.0 2023-03-22 [1] RSPM (R 4.3.0) + pkgconfig 2.0.3 2019-09-22 [1] RSPM (R 4.3.0) + processx 3.8.3 2023-12-10 [1] RSPM (R 4.3.0) + promises 1.3.0 2024-04-05 [1] RSPM (R 4.3.0) + ps 1.7.6 2024-01-18 [1] RSPM (R 4.3.0) + purrr 1.0.2 2023-08-10 [1] RSPM (R 4.3.0) + R6 2.5.1 2021-08-19 [1] RSPM (R 4.3.0) + Rcpp 1.0.12 2024-01-09 [1] RSPM (R 4.3.0) + remotes 2.5.0 2024-03-17 [1] RSPM (R 4.3.0) + rex 1.2.1 2021-11-26 [1] RSPM (R 4.3.0) + rlang 1.1.3 2024-01-10 [1] RSPM (R 4.3.0) + rmarkdown 2.28 2024-08-17 [1] RSPM (R 4.3.0) + rstudioapi 0.15.0 2023-07-07 [1] RSPM (R 4.3.0) + sessioninfo 1.2.2 2021-12-06 [1] RSPM (R 4.3.0) + shiny 1.9.1 2024-08-01 [1] RSPM (R 4.3.0) + stringi 1.8.3 2023-12-11 [1] RSPM (R 4.3.0) + stringr 1.5.1 2023-11-14 [1] RSPM (R 4.3.0) + tibble 3.2.1 2023-03-20 [1] RSPM (R 4.3.0) + tidyr 1.3.1 2024-01-24 [1] RSPM (R 4.3.0) + tidyselect 1.2.0 2022-10-10 [1] RSPM (R 4.3.0) + utf8 1.2.4 2023-10-22 [1] RSPM (R 4.3.0) + vctrs 0.6.5 2023-12-01 [1] RSPM (R 4.3.0) + waiter 0.2.5 2022-01-03 [1] RSPM (R 4.3.0) + withr 3.0.0 2024-01-16 [1] RSPM (R 4.3.0) + xfun 0.42 2024-02-08 [1] RSPM (R 4.3.0) + xml2 1.3.6 2023-12-04 [1] RSPM (R 4.3.0) + xtable 1.8-4 2019-04-21 [1] RSPM (R 4.3.0) + yaml 2.3.8 2023-12-11 [1] RSPM (R 4.3.0) + + [1] /usr/local/lib/R/site-library + [2] /usr/local/lib/R/library + +─ External software 
────────────────────────────────────────────────────────── + setting value + cairo 1.16.0 + cairoFT + pango 1.50.6 + png 1.6.37 + jpeg 8.0 + tiff LIBTIFF, Version 4.3.0 + tcl + curl 7.81.0 + zlib 1.2.11 + bzlib 1.0.8, 13-Jul-2019 + xz 5.2.5 + PCRE 10.39 2021-10-29 + ICU 70.1 + TRE TRE 0.8.0 R_fixes (BSD) + iconv glibc 2.35 + readline 8.1 + BLAS /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 + + lapack /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so + lapack_version 3.10.0 + +─ Python configuration ─────────────────────────────────────────────────────── + Python is not available + +────────────────────────────────────────────────────────────────────────────── +-------------------------------------------------------------------------------- +- Masked Functions - +-------------------------------------------------------------------------------- +function `plot` from {package:base} by package:graphics +function `body<-` from {package:base} by package:methods +function `kronecker` from {package:base} by package:methods +-------------------------------------------------------------------------------- +- Used Package and Functions - +-------------------------------------------------------------------------------- +{!!! 
NOT FOUND !!!} `:=` +{package:base} library, tempfile, cat, options, list, print, readLines +{package:dplyr} mutate, bind_cols +{package:mirai} daemons, everywhere, mirai_map +{package:tidyr} tibble +-------------------------------------------------------------------------------- +- Program Run Time Information - +-------------------------------------------------------------------------------- +Start time: 2024-10-07 22:16:34 UTC +End time: 2024-10-07 22:16:36 UTC +Run time: 2 seconds +-------------------------------------------------------------------------------- +- Errors and Warnings - +-------------------------------------------------------------------------------- +Errors: + + +Warnings: + +-------------------------------------------------------------------------------- +- Messages, Output, and Result - +-------------------------------------------------------------------------------- + +Result: + $logs + [1] "mutate: new variable 'b' (character) with one unique value and 0% NAmutate: new variable 'a' (character) with one unique value and 0% NA" + [2] "" + [3] "mutate: new variable 'd' (character) with one unique value and 0% NA" + [4] "mutate: new variable 'e' (character) with one unique value and 0% NA" + [5] "mutate: new variable 'c' (character) with one unique value and 0% NA" + + $result + # A tibble: 1 × 5 + a b c d e + + 1 foo foo foo foo foo + +-------------------------------------------------------------------------------- +- Log Output File - +-------------------------------------------------------------------------------- +Log name: mirai_workflow.log +Log path: /home/ale/blog/posts/zzz_DO_NOT_EDIT_the__tensio... 
diff --git a/posts/2024-10-16_the__tensio.../pharmaverse.PNG b/posts/2024-10-16_the__tensio.../pharmaverse.PNG new file mode 100644 index 00000000..7ee40c66 Binary files /dev/null and b/posts/2024-10-16_the__tensio.../pharmaverse.PNG differ diff --git a/posts/2024-10-16_the__tensio.../the__tension_of__high-_performance__computing:__reproducibility_vs.__parallelization.qmd b/posts/2024-10-16_the__tensio.../the__tension_of__high-_performance__computing:__reproducibility_vs.__parallelization.qmd new file mode 100644 index 00000000..1964a354 --- /dev/null +++ b/posts/2024-10-16_the__tensio.../the__tension_of__high-_performance__computing:__reproducibility_vs.__parallelization.qmd @@ -0,0 +1,316 @@ +--- +title: 'The Tension of High-Performance Computing: Reproducibility vs. Parallelization' +author: +- name: Alexandros Kouretsis +- name: APPSILON +description: Discover how to manage parallel processing and ensure reproducibility + in drug development using the {mirai} package and other HPC tools. +date: '2024-10-16' +categories: +- Submissions +- Technical +image: pharmaverse.PNG +--- + + + +```{r setup, include=FALSE} +long_slug <- "zzz_DO_NOT_EDIT_the__tensio..." +# renv::use(lockfile = "renv.lock") +``` + + + +## Harnessing HPC for Drug Development + +In pharmaceutical research, high-performance computing (HPC) plays a pivotal +role in driving advancements in drug discovery and development. From analyzing +vast genomic datasets to simulating drug interactions across diverse +populations, HPC enables researchers to tackle complex computational tasks at +high speeds. As pharmaceutical research becomes increasingly data-driven, the +need for powerful computational tools has grown, allowing for more accurate +predictions, faster testing, and more efficient processes. However, with the +growing complexity and scale of these computations, ensuring reproducibility +of results becomes a significant challenge. 
+ +In this blog post, we will explore common reproducibility challenges in drug +development and simulations, using the +[`{mirai}`](https://shikokuchuo.net/mirai/) package as a backend solution to +manage parallelization. + +## The Problem: Reproducibility in Parallel Processing + +Imagine a research team working on a cutting-edge drug development project. To +process and analyze vast amounts of data efficiently, they leverage parallel +processing, distributing tasks across multiple processors. This approach +significantly accelerates their work, enabling them to handle large datasets and +complex computations in a fraction of the time. + +However, the team soon encounters an issue. Each time they rerun the same +processing tasks with identical input parameters, the results differ slightly. +This raises a major concern: *the results are not reproducible.* In industries +like pharmaceuticals, where accuracy and consistency are critical, +reproducibility is not just important—it's a regulatory requirement. + +For example, in large-scale Monte Carlo simulations, small differences can arise +not only from changes in execution order across processors but also from +inconsistencies between workers or difficulties in maintaining synchronized +random number generation (RNG) streams. Furthermore, the more complex the +environment—with multiple components such as distributed workers, different +hardware, or varying system configurations—the harder it becomes to reprovision +the exact same environment and repeat the computations exactly. As these +variations accumulate, ensuring consistent and reproducible results becomes +a significant challenge in data-driven research. + +### Tracking Operations in Parallel Computing + +Let’s explore a simple scenario where parallelization creates confusion in +tracking operations due to the asynchronous nature of task execution and +logging. 
For this, we will also use the +[`{tidylog}`](https://github.com/elbersb/tidylog) package, which tracks and logs +`{dplyr}` operations, providing insight into how the computations are executed +across multiple workers. + +We'll create our workflow in a script and run it using the `{logrx}` package +from Pharmaverse. The workflow will be written as an expression using +`base::substitute()`, which will help generate the complete script. In our +example, we'll start four daemons. A daemon is a process that runs continuously +in the background and handles specific computing tasks. + +```{r, message=FALSE, eval=TRUE} +mirai_workflow <- substitute({ + library("mirai") + library("dplyr") + + log_file <- tempfile() + + # start parallel workers + mirai::daemons(4) + + # load libraries on each worker and set up logging to a file + mirai::everywhere( + { + library("dplyr") + library("tidylog") + + # Define function to log messages to the log file + log_to_file <- \(txt) cat(txt, file = log_file, sep = "\n", append = TRUE) + options("tidylog.display" = list(message, log_to_file)) + }, + log_file = log_file + ) + + # perform computations in parallel + m <- mirai_map(letters[1:5], \(x) { + mutate(tibble(.rows = 1), "{x}" := sample(1:100, 1)) + }) + + # collect results + result <- m[] |> bind_cols() + + mirai::daemons(0) + + print( + list( + logs = readLines(log_file), + result = result + ) + ) +}) +``` + +In the above code chunk, we set up a parallel processing environment using the +`{mirai}` package. The function `mirai_map()` is used to apply a mutating +function in parallel to a tibble for each element of `letters`, logging the +operations to a file using the `{tidylog}` package. However, while we can log +each operation as it happens, due to the parallel nature of `{mirai}`, the +logging does not occur in a controlled or sequential order. 
*Each daemon +executes its task independently, and the order of logging in the file will +depend on the completion times of these parallel processes rather than the +intended flow of operations.* + +> Parallel computations can obscure the traceability of operations + +This lack of control can lead to a situation where the log entries do not +reflect the actual sequence in which the `{dplyr}` commands were expected to be +processed. Although the operations themselves are carried out correctly, the +asynchronous logging may create challenges in *tracing* and *debugging* the +process, as entries in the log file could appear out of order, giving an +incomplete or misleading representation of the task flow. + +Let's first save the code to an R script called `mirai_workflow.R`. This step +helps ensure that the execution can be properly tracked and documented: + +```{r} +mirai_workflow |> + deparse() |> + writeLines("mirai_workflow.R") +``` + +Next, we execute the script using `logrx::axecute()`, which not only runs the +workflow but also logs key metadata and outputs for enhanced traceability and +reproducibility: + +```{r, eval=FALSE} +logrx::axecute("mirai_workflow.R", to_report = "result") +``` + +```{r cache_exec, eval=FALSE, echo=FALSE} +# run this to refresh cache and get a non ordered log file +res_to_cache <- source("mirai_workflow.R") +saveRDS(res_to_cache$value, "cache_execution.rds") +``` + +```{r, echo=FALSE} +readRDS("cache_execution.rds") +``` + +Upon examining the log file generated, you'll notice that the entries are not in +the same order as the commands were dispatched. This illustrates the inherent +difficulty in maintaining a consistent logging sequence for parallel tasks, +especially since the timing of each process completion and log recording is +unpredictable. + +Additionally, it is worth noting that `logrx` does not capture the logging +performed by `{tidylog}` during the execution of tasks on `{mirai}` daemons. 
+This is because the daemons run as independent R processes, and the logging +messages are not propagated back to the parent process in a straightforward +manner. As described in `{mirai}`'s documentation, daemons are responsible for +handling tasks asynchronously, and messages logged within these processes do not +automatically integrate into the parent session. Therefore, we access +`{tidylog}` messages indirectly, by reading the dedicated log file +(`log_file`) that each worker writes to during execution. + +### Task Dispatching and RNG Management + +By default, `{mirai}` uses an advanced dispatcher to manage task distribution +efficiently, scheduling tasks in a First-In-First-Out manner and leveraging +[`{nanonext}`](https://shikokuchuo.net/nanonext/) primitives for zero-latency, +resource-free task management. However, its asynchronous execution can hinder +reproducibility, especially with random number generation (RNG) or tasks needing +strict order. + +To enhance reproducibility, `{mirai}` allows disabling the dispatcher, which +usually decides the order in which tasks are run. Instead, it connects directly +to the workers one by one in a simple order +(see [round-robin](https://en.wikipedia.org/wiki/Round-robin_scheduling)). While +less efficient, this approach provides greater control over task execution and +is better suited for ensuring reproducibility by initializing +[L'Ecuyer-CMRG RNG streams](https://doi.org/10.1287/opre.47.1.159). + +In the following example, we simulate drug efficacy across different patient +cohorts using parallel processing with the `{mirai}` package. We define three +cohorts, each with a different mean drug effect and standard deviation, and +initialize four daemons to handle the computations. 
+ +```{r} +library(mirai) +library(dplyr, warn.conflicts = FALSE) + +# Parameters for the simulation +cohorts <- tribble( + ~patient_count, ~mean_effect, ~sd_effect, + 1000, 0.7, 0.1, + 1000, 0.65, 0.15, + 1000, 0.75, 0.05 +) + +# Start daemons with consistent RNG streams +x <- mirai::daemons( + n = 4, + dispatcher = "none", # For mirai versions below 1.3.0, use dispatcher = FALSE + seed = 123 +) + +# Parallel simulation for each row of the cohorts table +m <- mirai::mirai_map(cohorts, \(patient_count, mean_effect, sd_effect) { + dplyr::tibble( + patient_id = seq_len(patient_count), + efficacy = rnorm(patient_count, mean = mean_effect, sd = sd_effect) + ) +}) + +results <- m[] |> bind_rows(.id = "cohort") # tag each row with its cohort; patient_id repeats across cohorts + +x <- mirai::daemons(0, dispatcher = "none") + +results %>% + group_by(cohort) %>% + summarise( + mean_efficacy = mean(efficacy), + sd_efficacy = sd(efficacy) + ) +``` + +We used `tribble()` to define the simulation parameters and +initialized 4 daemons with `dispatcher = "none"` and a fixed seed to ensure +consistent random number generation across tasks. The `mirai_map()` function +parallelizes the drug efficacy simulation, and the results are combined using +`bind_rows()` and summarised per cohort for further analysis. + +Disabling the dispatcher gives more control over task execution, ensuring +reproducibility. If you repeat the computation you +will notice that it generates consistent results. However, this approach comes +at a cost. Disabling the dispatcher may lead to inefficient resource utilization +when tasks are unevenly distributed, as some daemons may remain idle. While +reproducibility is prioritized, we sacrifice some performance, especially +when handling tasks with varying workloads. + +Reproducibility becomes trickier when using parallelization frameworks like +`{parallelMap}`, `{doFuture}`, and `{future}`, as each handles random number +generation (RNG) differently. 
While `set.seed()` works for sequential tasks, +parallel tasks need careful management of RNG streams, often using specific +methods like "L’Ecuyer-CMRG" or functions like `clusterSetRNGStream()` to keep +results consistent. Each framework has its own approach, so it's important to +understand how each one manages RNG to ensure reproducibility. + +Even without random numbers, simple tasks—like adding floating-point numbers—can +give different results in parallel processing. This happens because +floating-point numbers aren’t exactly represented, and the order of operations +can affect the outcome. In parallel environments, where tasks finish in +different orders, these small differences can add up, making it harder to +reproduce results in large computations. + +## Closing Thoughts + +While we've explored the basics of reproducibility in parallel computing with +simple examples, the challenges extend beyond random number generation. Issues +such as *process synchronization*, using tools like lock files (see for example +[`{filelock}`](https://r-lib.github.io/filelock/)), become critical +in multi-process environments. *Floating-point arithmetic* adds complexity, +particularly when distributed across heterogeneous systems with varying +architectures and precision. *Managing dependencies* also becomes more +intricate as tasks grow in complexity, and ensuring *error recovery* in a +controlled manner is vital to avoid crashes or inconsistent results in +large-scale operations. + +Powerful tools like [`{targets}`](https://docs.ropensci.org/targets/) and +[`{crew}`](https://wlandau.github.io/crew/) can help tackle these advanced +challenges. `{targets}` is a workflow orchestration tool that manages +dependencies, automates reproducible pipelines, and ensures consistent results +across runs. 
Meanwhile, `{crew}` extends this by efficiently managing +distributed computing tasks, allowing for seamless scaling, load balancing, +and error handling across local processes or cloud environments. Together, these +tools simplify the execution of complex high-performance computing (HPC) +workflows, providing flexibility and robustness for scaling computations while +maintaining control and reproducibility. + +This blog post has hopefully increased your intuition about the challenges that +may arise when incorporating HPC into your work. By understanding these +complexities, you’ll be better positioned to make informed decisions about the +trade-offs—such as balancing performance and reproducibility—that are most +relevant to your specific case. As your computations scale, finding the right +balance between efficiency, accuracy, and reproducibility will be crucial for +the success of your projects. + + + +```{r, echo=FALSE} +source("appendix.R") +insert_appendix( + repo_spec = "pharmaverse/blog", + name = long_slug, + # file_name should be the name of your file; the pattern is an anchored regex so only names ending in ".qmd" match + file_name = list.files() %>% stringr::str_subset("\\.qmd$") %>% first() +) +```