Commit

Update vignette to only rebuild modified datasets (#290)
Closes #287

---------

Co-authored-by: github-actions <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Melkiades <[email protected]>
3 people authored Aug 31, 2023
1 parent ad08fd2 commit 8a6332a
Showing 4 changed files with 151 additions and 102 deletions.
3 changes: 3 additions & 0 deletions NEWS.md
@@ -1,5 +1,8 @@
# random.cdisc.data 0.3.14.9002

### New features
* Selective cached data updates in `rebuild_cached_data` vignette - data is only updated if its (or its dependency's) source file has been updated.

# random.cdisc.data 0.3.14

### Breaking changes
2 changes: 1 addition & 1 deletion R/radab.R
@@ -58,7 +58,7 @@ radab <- function(adsl,
cached = FALSE) {
checkmate::assert_flag(cached)
if (cached) {
return(get_cached_data("cadpc"))
return(get_cached_data("cadab"))
}

checkmate::assert_data_frame(adpc)
Binary file modified data/cadab.RData
248 changes: 147 additions & 101 deletions vignettes/rebuild_cached_data.Rmd
@@ -1,128 +1,174 @@
---
title: "Rebuilding Random CDISC Cached Data"
author: "NEST team"
date: "8/17/2020"
title: "Rebuilding Cached Random CDISC Data"
output: rmarkdown::html_vignette
vignette: >
%\VignetteIndexEntry{Rebuilding Random CDISC Cached Data}
%\VignetteIndexEntry{Rebuilding Cached Random CDISC Data}
%\VignetteEngine{knitr::rmarkdown}
%\VignetteEncoding{UTF-8}
---

# Getting Started
## Getting Started

The following script is used to create, compare and save cached data to `data/` directory.
The following script is used to create, compare and save cached data to the `data/` directory.

Starting in `R 3.6.0`, the default sampling algorithm used by the random-number generator changed.
To make results from `set.seed()` match those generated under earlier R versions, you must first call
`RNGkind(sample.kind = "Rounding")`.

It does throw an expected warning
It throws the expected warning:

```
Warning message:
In RNGkind(sample.kind = "Rounding") : non-uniform 'Rounding' sampler used
Warning: non-uniform 'Rounding' sampler used
```
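
For instance, a minimal sketch of reproducing a pre-3.6.0 sampling sequence:

```{r, eval=FALSE}
# Restore the pre-R-3.6.0 sampling behaviour, then seed as usual
RNGkind(sample.kind = "Rounding")
set.seed(1)
sample(5) # matches the sequence produced by set.seed(1) under R < 3.6.0
```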

# Code maintenance
## Code Maintenance

Currently, when a new random.cdisc.data function is created then this script needs to be manually updated to include the new cached data set in the rebuild process. If an existing function is modified then the cached data also need to be updated by running through the rebuild process.
Currently, when a `random.cdisc.data` data-generating function is created or modified, the code chunk below must
be run to build the new or updated cached dataset and add it to the `data/` directory. If the updated dataset is a
dependency of other datasets, those dependent datasets are rebuilt as well. To manually specify which
datasets should be updated, edit the `data_to_update` vector below, entering the desired dataset names.

# Code to run interactively
## Update Cached Data

**Note:** Prior to running the following code chunk, please ensure that you have reinstalled the `random.cdisc.data`
package after completing all dataset modifications.
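
For example, one common way to reinstall from the package root (assuming the `devtools` package is available):

```{r, eval=FALSE}
# Reinstall so the session picks up the modified data-generating functions
devtools::install()
```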

```{r, eval=FALSE}
# Helper function: given the names of the updated datasets and the dependency
# tree, return the full, dependency-ordered vector of datasets to rebuild
flatten_list_of_deps <- function(updated_data, data_deps) {
# Find the datasets that directly depend on any of the given datasets
get_higher_deps <- function(cur_dep, data_deps) {
sapply(seq_along(data_deps), function(x) {
if (any(cur_dep %in% data_deps[[x]])) {
names(data_deps)[x]
}
})
}
# Get the direct dependencies of the given datasets
get_lower_deps <- function(cur_dep, data_deps) {
data_deps[sapply(cur_dep, function(x) which(x == names(data_deps)))]
}
# Order the given datasets so that dependencies come before their dependents
sort_data_deps <- function(upd_data, data_deps) {
iup <- upd_data
for (ud in upd_data) {
up <- unlist(get_lower_deps(ud, data_deps))
if (any(up %in% upd_data)) {
iup <- unique(unlist(c(up[up %in% upd_data], iup)))
}
}
iup
}
# First, order the initially updated datasets by their dependencies
fin_up <- sort_data_deps(updated_data, data_deps)
# Then walk the vector, appending every dataset that depends on an entry already in it
cnt <- 1
while (cnt <= length(fin_up)) {
cur_deps <- unlist(
get_higher_deps(fin_up[cnt], data_deps)
)
if (!is.null(cur_deps)) {
cur_deps <- sort_data_deps(cur_deps, data_deps)
fin_up <- unique(c(fin_up[seq_len(cnt)], cur_deps, fin_up[-seq_len(cnt)]))
}
cnt <- cnt + 1
}
fin_up
}
```
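
As a quick illustration of what `flatten_list_of_deps()` produces, here is a toy dependency tree (the expected result assumes the helper as defined above; the dataset names simply mirror the real ones):

```{r, eval=FALSE}
# adsl has no dataset dependencies, adpc depends on adsl, adab depends on both
toy_deps <- list(adsl = character(0), adpc = "adsl", adab = c("adsl", "adpc"))
# Updating adpc should also flag adab, which depends on it, in dependency order
flatten_list_of_deps("adpc", toy_deps) # expected: c("adpc", "adab")
```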

```{r, eval=FALSE}
library(dplyr)
library(random.cdisc.data)
library(diffdf)
library(dplyr)
# call function to be able to match random number generation from previous R versions
# Call function to match random number generation from previous R versions
RNGkind(sample.kind = "Rounding")
# create the new cached data sets
cadsl_new <- radsl(seed = 1, na_vars = list()) %>% `attr<-`("creation date", Sys.Date())
cadae_new <- radae(cadsl_new, seed = 1, na_vars = list()) %>% `attr<-`("creation date", Sys.Date())
cadaette_new <- radaette(cadsl_new, seed = 1, na_vars = list()) %>% `attr<-`("creation date", Sys.Date())
cadcm_new <- radcm(cadsl_new, seed = 1, who_coding = TRUE, na_vars = list()) %>% `attr<-`("creation date", Sys.Date())
cadex_new <- radex(cadsl_new, seed = 1, na_vars = list()) %>% `attr<-`("creation date", Sys.Date())
cadlb_new <- radlb(cadsl_new, seed = 1, na_vars = list()) %>% `attr<-`("creation date", Sys.Date())
cadmh_new <- radmh(cadsl_new, seed = 1, na_vars = list()) %>% `attr<-`("creation date", Sys.Date())
cadpc_new <- radpc(cadsl_new, seed = 1, na_vars = list()) %>% `attr<-`("creation date", Sys.Date())
cadpp_new <- radpp(cadsl_new, seed = 1, na_vars = list()) %>% `attr<-`("creation date", Sys.Date())
cadab_new <- radab(cadsl_new, cadpc_new, seed = 1, na_vars = list()) %>% `attr<-`("creation date", Sys.Date())
cadqs_new <- radqs(cadsl_new, seed = 1, na_vars = list()) %>% `attr<-`("creation date", Sys.Date())
cadrs_new <- radrs(cadsl_new, seed = 1, na_vars = list()) %>% `attr<-`("creation date", Sys.Date())
cadtte_new <- radtte(cadsl_new, seed = 1, na_vars = list()) %>% `attr<-`("creation date", Sys.Date())
cadvs_new <- radvs(cadsl_new, seed = 1, na_vars = list()) %>% `attr<-`("creation date", Sys.Date())
cadeg_new <- radeg(cadsl_new, seed = 1, na_vars = list()) %>% `attr<-`("creation date", Sys.Date())
cadtr_new <- radtr(cadsl_new, seed = 1, na_vars = list()) %>% `attr<-`("creation date", Sys.Date())
caddv_new <- raddv(cadsl_new, seed = 1, na_vars = list()) %>% `attr<-`("creation date", Sys.Date())
cadsub_new <- radsub(cadsl_new, seed = 1, na_vars = list()) %>% `attr<-`("creation date", Sys.Date())
cadhy_new <- radhy(cadsl_new, seed = 1) %>% `attr<-`("creation date", Sys.Date())
cadqlqc_new <- radqlqc(cadsl_new, percent = 80, number = 2, seed = 1) %>% `attr<-`("creation date", Sys.Date())
# use diffdf package: diffdf() to compare cached vs new and review/verify differences
diffdf(cadsl, cadsl_new)
diffdf(cadab, cadab_new)
diffdf(cadae, cadae_new)
diffdf(cadaette, cadaette_new)
diffdf(cadcm, cadcm_new)
diffdf(cadex, cadex_new)
diffdf(cadlb, cadlb_new)
diffdf(cadmh, cadmh_new)
diffdf(cadpc, cadpc_new)
diffdf(cadpp, cadpp_new)
diffdf(cadqs, cadqs_new)
diffdf(cadrs, cadrs_new)
diffdf(cadtte, cadtte_new)
diffdf(cadvs, cadvs_new)
diffdf(cadeg, cadeg_new)
diffdf(cadtr, cadtr_new)
diffdf(caddv, caddv_new)
diffdf(cadsub, cadsub_new)
diffdf(cadhy, cadhy_new)
diffdf(cadqlqc, cadqlqc_new)
# when diff reflects changes accurately then create environment objects to save out to cache
cadsl <- cadsl_new
cadab <- cadab_new
cadae <- cadae_new
cadaette <- cadaette_new
cadcm <- cadcm_new
cadeg <- cadeg_new
cadex <- cadex_new
cadlb <- cadlb_new
cadmh <- cadmh_new
cadpc <- cadpc_new
cadpp <- cadpp_new
cadqs <- cadqs_new
cadrs <- cadrs_new
cadtr <- cadtr_new
cadtte <- cadtte_new
cadvs <- cadvs_new
caddv <- caddv_new
cadsub <- cadsub_new
cadhy <- cadhy_new
cadqlqc <- cadqlqc_new
# update cache
save(cadsl, file = "data/cadsl.RData", compress = "xz")
save(cadab, file = "data/cadab.RData", compress = "xz")
save(cadae, file = "data/cadae.RData", compress = "xz")
save(cadaette, file = "data/cadaette.RData", compress = "xz")
save(cadcm, file = "data/cadcm.RData", compress = "xz")
save(cadeg, file = "data/cadeg.RData", compress = "xz")
save(cadex, file = "data/cadex.RData", compress = "xz")
save(cadlb, file = "data/cadlb.RData", compress = "xz")
save(cadmh, file = "data/cadmh.RData", compress = "xz")
save(cadpc, file = "data/cadpc.RData", compress = "xz")
save(cadpp, file = "data/cadpp.RData", compress = "xz")
save(cadqs, file = "data/cadqs.RData", compress = "xz")
save(cadrs, file = "data/cadrs.RData", compress = "xz")
save(cadtr, file = "data/cadtr.RData", compress = "xz")
save(cadtte, file = "data/cadtte.RData", compress = "xz")
save(cadvs, file = "data/cadvs.RData", compress = "xz")
save(caddv, file = "data/caddv.RData", compress = "xz")
save(cadsub, file = "data/cadsub.RData", compress = "xz")
save(cadhy, file = "data/cadhy.RData", compress = "xz")
save(cadqlqc, file = "data/cadqlqc.RData", compress = "xz")
# Datasets must be processed after all of their dependencies,
# e.g. adsl is a dependency for all other datasets so it is always rebuilt first.
pkg_dir <- dirname(getwd()) # Assumes the working directory is the package's vignettes/ folder
# List the source files and extract the dataset names
src_files <- list.files(paste0(pkg_dir, "/R"))
data_nms <- src_files[grepl("^rad", src_files)] %>%
stringr::str_remove(pattern = "^r") %>%
stringr::str_remove(pattern = "\\.R$") %>%
sort()
# Exception handling
data_nms <- data_nms[data_nms != "adsaftte"] # Unbuilt for now
# Construction of dependency tree based on formals
data_deps <- sapply(
data_nms,
function(x) {
dat_args <- names(formals(paste0("r", x)))
dat_args[dat_args %in% data_nms]
}
)
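# For example, radab() takes adsl and adpc as inputs, so data_deps$adab would be
# c("adsl", "adpc"), while data_deps$adsl is character(0) (no dataset dependencies)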
git_call <- "git diff origin/main --name-only"
updated_files <- tryCatch(
system(git_call, intern = TRUE),
error = function(e) e
)
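# updated_files holds paths relative to the repository root,
# e.g. c("R/radab.R", "NEWS.md") if only those files differ from origin/main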
status_uf <- attr(updated_files, "status")
if (is(updated_files, "error") || (!is.null(status_uf) && status_uf == 1)) {
message("The following git call failed: ", git_call)
if (is(updated_files, "error")) message(conditionMessage(updated_files))
message(
"Falling back to the default behaviour: all datasets are recreated ",
"and the cached data is updated wherever a change is found."
)
updated_data <- data_nms
} else {
updated_data <- updated_files[grepl("^R\\/", updated_files)] %>%
stringr::str_remove("^R\\/") %>%
stringr::str_remove(pattern = "^r") %>%
stringr::str_remove(pattern = ".R$")
}
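# At this point updated_data holds dataset names, e.g. "R/radab.R" maps to "adab"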
if (length(updated_data) != 0) {
stopifnot(all(updated_data %in% names(data_deps)))
data_to_update <- flatten_list_of_deps(updated_data, data_deps)
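# To rebuild a hand-picked set of datasets instead, override this vector here,
# e.g. data_to_update <- c("adsl", "adab") (the names shown are only illustrative)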
default_args <- list(seed = 1, na_vars = list(), who_coding = TRUE, percent = 80, number = 2)
# Generate and save updated cached datasets
for (dat in data_to_update) {
# Match arguments with defaults
dat_args <- default_args[names(default_args) %in% names(formals(paste0("r", dat)))]
# Fetch the cached versions of this dataset's dependencies (an empty list for adsl)
dat_deps <- lapply(data_deps[[dat]], function(x) get(paste0("c", x)))
# Main call to the data creation function
cdataset <- do.call(paste0("r", dat), c(dat_args, dat_deps))
# Preview differences between the current cache and the regenerated dataset
cat("\nChecking cached data for dataset", paste0("*", dat, "*"), "with the following changes found (diffdf):\n")
diff_test <- diffdf(get(paste0("c", dat)), cdataset)
print(diff_test)
# Check if there is any actual change to the data
if (length(diff_test) > 0) { # If no difference -> list()
# Stamp the creation date, then save the new cached dataset
attr(cdataset, "creation date") <- lubridate::date() # Should NOT be updated if diffdf finds no changes # nolint
assign(paste0("c", dat), cdataset)
fl_save <- paste0(dirname(getwd()), "/data/c", dat, ".RData")
save(list = paste0("c", dat), file = fl_save, compress = "xz")
cat("Cached dataset updated for", paste0("*", dat, "*"), "in", paste0("data/", basename(fl_save), "."), "\n")
} else {
message("No update detected on the final data. No cached data was updated for *", dat, "*.")
}
}
} else {
message("No source files changed: no cached datasets currently require updates.")
}
```
