diff --git a/.Rbuildignore b/.Rbuildignore index 4ff0ac6e..940803e6 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -12,4 +12,7 @@ ^docs$ ^pkgdown$ ^data-raw$ +^.DS_Store$ +^CRAN-SUBMISSION$ +^cran-comments\.md$ diff --git a/.github/workflows/check_nightly_cran.yaml b/.github/workflows/on_nightly_cran.yaml similarity index 96% rename from .github/workflows/check_nightly_cran.yaml rename to .github/workflows/on_nightly_cran.yaml index aaeffe7f..c74f3bba 100644 --- a/.github/workflows/check_nightly_cran.yaml +++ b/.github/workflows/on_nightly_cran.yaml @@ -4,7 +4,7 @@ on: schedule: - cron: '0 4 * * 1' -name: Check Nightly (CRAN) +name: On Nightly (CRAN) jobs: diff --git a/.github/workflows/check_nightly_main.yaml b/.github/workflows/on_nightly_main.yaml similarity index 96% rename from .github/workflows/check_nightly_main.yaml rename to .github/workflows/on_nightly_main.yaml index f8f2be78..bdca4992 100644 --- a/.github/workflows/check_nightly_main.yaml +++ b/.github/workflows/on_nightly_main.yaml @@ -4,7 +4,7 @@ on: schedule: - cron: '0 4 * * 1,2,3,4,5' -name: Check Nightly (Main) +name: On Nightly (Main) jobs: diff --git a/.github/workflows/check_pr.yaml b/.github/workflows/on_pr.yaml similarity index 62% rename from .github/workflows/check_pr.yaml rename to .github/workflows/on_pr.yaml index f69f8830..42532fd1 100644 --- a/.github/workflows/check_pr.yaml +++ b/.github/workflows/on_pr.yaml @@ -5,7 +5,7 @@ on: paths-ignore: - 'misc/**' -name: Check Pull Requests +name: On Pull Request jobs: @@ -36,10 +36,26 @@ jobs: - name: Build src uses: ./.github/actions/build-src - - name: Check run: | options(crayon.enabled = TRUE) rcmdcheck::rcmdcheck(args = c("--no-manual", "--as-cran"), error_on = "warning") shell: Rscript {0} - + + + vignettes: + strategy: + fail-fast: true + matrix: + config: + - { image: "ghcr.io/${{ github.repository_owner }}/rbmi:r404"} + - { image: "ghcr.io/${{ github.repository_owner }}/rbmi:r410"} + runs-on: ubuntu-latest + container: + image: ${{ matrix.config.image }} + steps: + - name: Checkout + uses: actions/checkout@v2 + - name: testthat + run: | + Rscript ./vignettes/build.R diff --git a/.github/workflows/check_pr_cran.yaml b/.github/workflows/on_pr_cran.yaml similarity index 97% rename from .github/workflows/check_pr_cran.yaml rename to .github/workflows/on_pr_cran.yaml index 884f31c3..dffacc93 100644 --- a/.github/workflows/check_pr_cran.yaml +++ b/.github/workflows/on_pr_cran.yaml @@ -7,7 +7,7 @@ on: branches: - cran -name: Check Pull Requests (CRAN) +name: On Pull Request CRAN jobs: diff --git a/.gitignore b/.gitignore index fa907402..2b6b7585 100644 --- a/.gitignore +++ b/.gitignore @@ -48,6 +48,7 @@ po/*~ src/* .DS_store +.DS_Store nul diff --git a/DESCRIPTION b/DESCRIPTION index c5d1c138..d7db4e6d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -7,7 +7,7 @@ Authors@R: c( person("Marcel", "Wolbers", email = "marcel.wolbers@roche.com", role = "ctb"), person("Roche", role = c("cph", "fnd")) ) -Description: Implements reference based multiple imputation allowing for the imputation of longditudinal datasets using pre-defined strategies. +Description: Implements reference based multiple imputation allowing for the imputation of longitudinal datasets using pre-defined strategies. 
Encoding: UTF-8 LazyData: true Roxygen: list(markdown = TRUE) @@ -26,7 +26,8 @@ Suggests: bookdown, lubridate, purrr, - ggplot2 + ggplot2, + R.rsp Biarch: true Config/testthat/edition: 3 Imports: @@ -50,4 +51,4 @@ SystemRequirements: GNU make Depends: R (>= 3.4.0) License: Apache License (>= 2) -VignetteBuilder: knitr +VignetteBuilder: R.rsp diff --git a/R/ancova.R b/R/ancova.R index 42d64ce0..57a08af4 100644 --- a/R/ancova.R +++ b/R/ancova.R @@ -159,7 +159,7 @@ ancova <- function(data, vars, visits = NULL, weights = c("proportional", "equal #' @param group Character, the name of the group variable in `data`. #' @param covariates Character vector containing the name of any additional covariates #' to be included in the model as well as any interaction terms. -#' @param weights Character, specifies wether to use "proportional" or "equal" weighting for each +#' @param weights Character, specifies whether to use "proportional" or "equal" weighting for each #' categorical covariate combination when calculating the lsmeans. #' #' @details diff --git a/R/dataclasses.R b/R/dataclasses.R index c6da88f3..7323a1fc 100644 --- a/R/dataclasses.R +++ b/R/dataclasses.R @@ -199,7 +199,7 @@ print.imputation_df <- function(x, ...) { #' which contains a matrix of [imputation_single()] objects grouped by a single `id`. The matrix #' is split so that it has D columns (i.e. for non-bmlmi methods this will always be 1) #' -#' The `id` attribute is deterimined by extracting the `id` attribute from the contributing +#' The `id` attribute is determined by extracting the `id` attribute from the contributing #' [imputation_single()] objects. An error is throw if multiple `id` are detected imputation_list_single <- function(imputations, D = 1) { diff --git a/R/delta.R b/R/delta.R index f79bef2e..62fa4bca 100644 --- a/R/delta.R +++ b/R/delta.R @@ -231,7 +231,7 @@ get_delta_template <- function(imputations) { } -#' Calculate delta from a lagged Sscale coefficient +#' Calculate delta from a lagged scale coefficient #' #' @description #' Calculates a delta value based upon a baseline delta value and a diff --git a/R/draws.R b/R/draws.R index ea7bb118..246c1a0d 100644 --- a/R/draws.R +++ b/R/draws.R @@ -133,7 +133,7 @@ #' - `group`: name of the column in `data` which contains the group variable. #' - `outcome`: name of the column in `data` which contains the outcome variable. #' - `covariates`: vector of characters which contains the covariates to be included -#' in the model (including interactions which are specified as "covariateName1*covariateName2"). +#' in the model (including interactions which are specified as `"covariateName1*covariateName2"``). #' If no covariates are provided the default model specification of `outcome ~ 1 + visit + group` will be used. #' Please note that the `group*visit` interaction #' is **not** included in the model by default. @@ -283,7 +283,7 @@ draws.bmlmi <- function(data, data_ice = NULL, vars, method, ncores = 1, quiet = #' #' This function takes a `Stack` object which contains multiple lists of patient ids. The function #' takes this Stack and pulls a set ids and then constructs a dataset just consisting of these -#' patients (i.e. potentially a boostrap or a jackknife sample). +#' patients (i.e. potentially a bootstrap or a jackknife sample). #' #' The function then fits a MMRM model to this dataset to create a sample object. The function #' repeats this process until `n_target_samples` have been reached. 
If more than `failure_limit` diff --git a/R/impute.R b/R/impute.R index 4c2dfb49..ec15eb0e 100644 --- a/R/impute.R +++ b/R/impute.R @@ -221,7 +221,7 @@ impute_internal <- function(draws, references = NULL, update_strategy, strategie #' that are required for that dataset. The total number of ID's must by equal to the #' total number of rows within all of `imputes$imputations` #' -#' To accomdate for `method_bmlmi()` the [impute_data_individual()] function returns +#' To accommodate for `method_bmlmi()` the [impute_data_individual()] function returns #' a list of [imputation_list_single()] objects with 1 object per each subject. #' #' [imputation_list_single()] stores the subjects imputations as a matrix where the columns @@ -280,7 +280,7 @@ impute_internal <- function(draws, references = NULL, update_strategy, strategie #' ) #' ``` #' -#' Note that the different repititions (i.e. the value set for D) are grouped together +#' Note that the different repetitions (i.e. the value set for D) are grouped together #' sequentially. #' convert_to_imputation_list_df <- function(imputes, sample_ids) { diff --git a/R/longData.R b/R/longData.R index 4253cb2f..1b489404 100644 --- a/R/longData.R +++ b/R/longData.R @@ -491,7 +491,7 @@ longDataConstructor <- R6::R6Class( #' @description #' Constructor function. - #' @param data longditudinal dataset. + #' @param data longitudinal dataset. #' @param vars an `ivars` object created by [set_vars()]. initialize = function(data, vars) { data_raw <- as_dataframe(data) diff --git a/R/lsmeans.R b/R/lsmeans.R index a0670917..f4dd226f 100644 --- a/R/lsmeans.R +++ b/R/lsmeans.R @@ -31,7 +31,7 @@ #' @param ... Fixes specific variables to specific values i.e. #' `trt = 1` or `age = 50`. The name of the argument must be the name #' of the variable within the dataset. -#' @param .weights Character, specifies wether to use "proportional" or "equal" weighting for each +#' @param .weights Character, specifies whether to use "proportional" or "equal" weighting for each #' categorical covariate combination when calculating the lsmeans. #' #' @references \url{https://CRAN.R-project.org/package=emmeans} diff --git a/R/mcmc.R b/R/mcmc.R index 0dea0e2a..2ff35615 100644 --- a/R/mcmc.R +++ b/R/mcmc.R @@ -126,8 +126,8 @@ fit_mcmc <- function( } ignorable_warnings <- c( - "Bulk Effective Samples Size (ESS) is too low, indicating posterior means and medians may be unreliable.\nRunning the chains for more iterations may help. See\nhttp://mc-stan.org/misc/warnings.html#bulk-ess", - "Tail Effective Samples Size (ESS) is too low, indicating posterior variances and tail quantiles may be unreliable.\nRunning the chains for more iterations may help. See\nhttp://mc-stan.org/misc/warnings.html#tail-ess" + "Bulk Effective Samples Size (ESS) is too low, indicating posterior means and medians may be unreliable.\nRunning the chains for more iterations may help. See\nhttps://mc-stan.org/misc/warnings.html#bulk-ess", + "Tail Effective Samples Size (ESS) is too low, indicating posterior variances and tail quantiles may be unreliable.\nRunning the chains for more iterations may help. See\nhttps://mc-stan.org/misc/warnings.html#tail-ess" ) # handle warning: display only warnings if diff --git a/R/mmrm.R b/R/mmrm.R index ac320669..270d2d38 100644 --- a/R/mmrm.R +++ b/R/mmrm.R @@ -175,7 +175,7 @@ extract_params <- function(fit) { #' @param group a character / factor vector. Indicates which treatment group the patient belongs to. #' @param cov_struct a character value. 
Specifies which covariance structure to use. Must be one of #' `"us"`, `"toep"`, `"cs"` or `"ar1"` -#' @param REML logical. Specifies whether restricted maximium likelihood should be used +#' @param REML logical. Specifies whether restricted maximum likelihood should be used #' @param same_cov logical. Used to specify if a shared or individual covariance matrix should be used #' per `group` #' @param initial_values a list with names `beta` and `theta`. Specifies the initial values to start diff --git a/R/parallel.R b/R/parallel.R index a9051816..7ed779a1 100644 --- a/R/parallel.R +++ b/R/parallel.R @@ -5,7 +5,7 @@ #' #' This function spawns a PSOCK cluster and exports all of the #' rbmi namespace into the the sub processes as well as loading -#' assertthat and glmmTMB +#' `assertthat` and `glmmTMB` get_cluster <- function(ncores = 1) { if (ncores == 1) { return(NULL) diff --git a/R/strategies.R b/R/strategies.R index d86809b5..b1fb2681 100644 --- a/R/strategies.R +++ b/R/strategies.R @@ -20,7 +20,7 @@ #' @references #' Carpenter, James R., James H. Roger, and Michael G. Kenward. "Analysis of longitudinal #' trials with protocol deviation: a framework for relevant, accessible assumptions, and -#' inference via multiple imputation." Journal of biopharmaceutical statistics 23.6 (2013): +#' inference via multiple imputation." Journal of Biopharmaceutical statistics 23.6 (2013): #' 1352-1371. compute_sigma <- function(sigma_group, sigma_ref, index_mar) { diff --git a/R/utilities.R b/R/utilities.R index ea08412c..cd4fa8cb 100644 --- a/R/utilities.R +++ b/R/utilities.R @@ -506,7 +506,7 @@ is_num_char_fact <- function(x) { #' @param x a data.frame like object #' #' Utility function to convert a "data.frame-like" object to an actual `data.frame` -#' to avoid issues with inconsitencies on methods (such as `[`() and dplyr's grouped dataframes) +#' to avoid issues with inconsistency on methods (such as `[`() and dplyr's grouped dataframes) as_dataframe <- function(x) { x2 <- as.data.frame(x) row.names(x2) <- NULL diff --git a/cran-comments.md b/cran-comments.md index a6982094..345713a3 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -1,12 +1,3 @@ -## Test environments - -The package was tested in the following environments (in GitHub actions): - -- Ubuntu 20.04, R release -- Windows latest, R release -- Mac OS latest, R release -- Ubuntu 20.04, R devel - ## R CMD check results There were no ERRORs or WARNINGs. @@ -26,7 +17,18 @@ There were 3 NOTEs: ❯ checking for GNU extensions in Makefiles ... NOTE GNU make is a SystemRequirements. -Both of the above notes are a consequence of using rstan in the package following the usage steps as described by the stan developers [here](https://cran.r-project.org/web/packages/rstantools/vignettes/minimal-rstan-package.html). Our understanding from the developers is that they are acceptable to ignore. +Both of the above notes are a consequence of using rstan in the package following the usage steps as described by the stan developers [here](https://cran.r-project.org/web/packages/rstantools/vignettes/minimal-rstan-package.html). Our understanding from the [developers](https://discourse.mc-stan.org/t/using-rstan-in-an-r-package-generates-r-cmd-check-notes/26628) is that they are acceptable to ignore. 
+ + +## Test environments + +The package was tested in the following environments (in GitHub actions): + +- Ubuntu 20.04, R release +- Windows latest, R release +- Mac OS latest, R release +- Ubuntu 20.04, R devel + ## Downstream dependencies diff --git a/man/ancova_single.Rd b/man/ancova_single.Rd index 1352866c..464e652a 100644 --- a/man/ancova_single.Rd +++ b/man/ancova_single.Rd @@ -22,7 +22,7 @@ ancova_single( \item{covariates}{Character vector containing the name of any additional covariates to be included in the model as well as any interaction terms.} -\item{weights}{Character, specifies wether to use "proportional" or "equal" weighting for each +\item{weights}{Character, specifies whether to use "proportional" or "equal" weighting for each categorical covariate combination when calculating the lsmeans.} } \description{ diff --git a/man/as_dataframe.Rd b/man/as_dataframe.Rd index de85c8c5..97356ae6 100644 --- a/man/as_dataframe.Rd +++ b/man/as_dataframe.Rd @@ -10,7 +10,7 @@ as_dataframe(x) \item{x}{a data.frame like object Utility function to convert a "data.frame-like" object to an actual \code{data.frame} -to avoid issues with inconsitencies on methods (such as \code{[}() and dplyr's grouped dataframes)} +to avoid issues with inconsistency on methods (such as \code{[}() and dplyr's grouped dataframes)} } \description{ Convert object to dataframe diff --git a/man/compute_sigma.Rd b/man/compute_sigma.Rd index d60a1d8c..bbbba3e3 100644 --- a/man/compute_sigma.Rd +++ b/man/compute_sigma.Rd @@ -26,6 +26,6 @@ et al. (2013) \references{ Carpenter, James R., James H. Roger, and Michael G. Kenward. "Analysis of longitudinal trials with protocol deviation: a framework for relevant, accessible assumptions, and -inference via multiple imputation." Journal of biopharmaceutical statistics 23.6 (2013): +inference via multiple imputation." Journal of Biopharmaceutical statistics 23.6 (2013): 1352-1371. } diff --git a/man/convert_to_imputation_list_df.Rd b/man/convert_to_imputation_list_df.Rd index df0744a4..3a1abd4a 100644 --- a/man/convert_to_imputation_list_df.Rd +++ b/man/convert_to_imputation_list_df.Rd @@ -15,7 +15,7 @@ must contain a vector of "ID"'s which correspond to the \code{\link[=imputation_ that are required for that dataset. The total number of ID's must by equal to the total number of rows within all of \code{imputes$imputations} -To accomdate for \code{method_bmlmi()} the \code{\link[=impute_data_individual]{impute_data_individual()}} function returns +To accommodate for \code{method_bmlmi()} the \code{\link[=impute_data_individual]{impute_data_individual()}} function returns a list of \code{\link[=imputation_list_single]{imputation_list_single()}} objects with 1 object per each subject. \code{\link[=imputation_list_single]{imputation_list_single()}} stores the subjects imputations as a matrix where the columns @@ -68,7 +68,7 @@ Then \code{convert_to_imputation_df(imputes, sample_ids)} would result in:\prefo ) } -Note that the different repititions (i.e. the value set for D) are grouped together +Note that the different repetitions (i.e. 
the value set for D) are grouped together sequentially.} } \description{ diff --git a/man/d_lagscale.Rd b/man/d_lagscale.Rd index 16867327..b9ae829f 100644 --- a/man/d_lagscale.Rd +++ b/man/d_lagscale.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/delta.R \name{d_lagscale} \alias{d_lagscale} -\title{Calculate delta from a lagged Sscale coefficient} +\title{Calculate delta from a lagged scale coefficient} \usage{ d_lagscale(delta, dlag, is_post_ice) } diff --git a/man/draws.Rd b/man/draws.Rd index 6aa8298a..4a628edc 100644 --- a/man/draws.Rd +++ b/man/draws.Rd @@ -187,9 +187,7 @@ The \code{vars} argument is a named list that specifies the names of key variabl \item \code{group}: name of the column in \code{data} which contains the group variable. \item \code{outcome}: name of the column in \code{data} which contains the outcome variable. \item \code{covariates}: vector of characters which contains the covariates to be included -in the model (including interactions which are specified as "covariateName1*covariateName2"). -If no covariates are provided the default model specification of \code{outcome ~ 1 + visit + group} will be used. -Please note that the \code{group*visit} interaction +in the model (including interactions which are specified as \verb{"covariateName1*covariateName2"``). If no covariates are provided the default model specification of }outcome ~ 1 + visit + group\verb{will be used. Please note that the}group*visit` interaction is \strong{not} included in the model by default. \item \code{strata}: covariates used as stratification variables in the bootstrap sampling. By default only the \code{vars$group} is set as stratification variable. diff --git a/man/fit_mmrm.Rd b/man/fit_mmrm.Rd index 32d3500b..a9955be7 100644 --- a/man/fit_mmrm.Rd +++ b/man/fit_mmrm.Rd @@ -34,7 +34,7 @@ the same subject.} \item{cov_struct}{a character value. Specifies which covariance structure to use. Must be one of \code{"us"}, \code{"toep"}, \code{"cs"} or \code{"ar1"}} -\item{REML}{logical. Specifies whether restricted maximium likelihood should be used} +\item{REML}{logical. Specifies whether restricted maximum likelihood should be used} \item{same_cov}{logical. Used to specify if a shared or individual covariance matrix should be used per \code{group}} diff --git a/man/get_cluster.Rd b/man/get_cluster.Rd index c00a3c0f..d1b8bb55 100644 --- a/man/get_cluster.Rd +++ b/man/get_cluster.Rd @@ -11,7 +11,7 @@ get_cluster(ncores = 1) This function spawns a PSOCK cluster and exports all of the rbmi namespace into the the sub processes as well as loading -assertthat and glmmTMB} +\code{assertthat} and \code{glmmTMB}} } \description{ Create cluster diff --git a/man/get_draws_mle.Rd b/man/get_draws_mle.Rd index a92d7c8b..14a4d10c 100644 --- a/man/get_draws_mle.Rd +++ b/man/get_draws_mle.Rd @@ -72,7 +72,7 @@ specified by \code{method$n_samples}. Returns the parameter estimates from the m \details{ This function takes a \code{Stack} object which contains multiple lists of patient ids. The function takes this Stack and pulls a set ids and then constructs a dataset just consisting of these -patients (i.e. potentially a boostrap or a jackknife sample). +patients (i.e. potentially a bootstrap or a jackknife sample). The function then fits a MMRM model to this dataset to create a sample object. The function repeats this process until \code{n_target_samples} have been reached. 
If more than \code{failure_limit} diff --git a/man/imputation_list_single.Rd b/man/imputation_list_single.Rd index e52ed7ca..9883f5ac 100644 --- a/man/imputation_list_single.Rd +++ b/man/imputation_list_single.Rd @@ -17,7 +17,7 @@ This is a constructor function to create a \code{imputation_list_single} object which contains a matrix of \code{\link[=imputation_single]{imputation_single()}} objects grouped by a single \code{id}. The matrix is split so that it has D columns (i.e. for non-bmlmi methods this will always be 1) -The \code{id} attribute is deterimined by extracting the \code{id} attribute from the contributing +The \code{id} attribute is determined by extracting the \code{id} attribute from the contributing \code{\link[=imputation_single]{imputation_single()}} objects. An error is throw if multiple \code{id} are detected} } \description{ diff --git a/man/longDataConstructor.Rd b/man/longDataConstructor.Rd index c07b27a6..f8cc6795 100644 --- a/man/longDataConstructor.Rd +++ b/man/longDataConstructor.Rd @@ -333,7 +333,7 @@ Constructor function. \subsection{Arguments}{ \if{html}{\out{
The purpose of this vignette is to provide an overview of some more advanced features of the rbmi package.
The sections of the vignette are relatively self-contained, i.e. readers should be able to jump directly to the section which covers the functionality that they are most interested in.

In order to demonstrate the advanced functions we will first create a simulated dataset with the rbmi function simulate_data().
The simulate_data() function generates data from a randomized clinical trial with longitudinal continuous outcomes and up to two different types of intercurrent events (ICEs). One intercurrent event (ICE1) may be thought of as a discontinuation from study treatment due to study drug or condition related (SDCR) reasons. The other event (ICE2) may be thought of as a discontinuation from study treatment due to not study drug or condition related (NSDCR) reasons. For the purpose of this vignette, we simulate data similarly to the simulation study reported in Wolbers et al. (2021) (though we change some of the simulation parameters) and include only one ICE type (ICE1).
Specifically, we simulate a 1:1 randomized trial of an active drug (intervention) versus placebo (control) with 100 subjects per group and 6 post-baseline assessments (bi-monthly visits until 12 months). The assumptions regarding the mean outcome trajectories in both groups, the covariance structure, the probability of study drug discontinuation (ICE1), and the probability of drop-out after discontinuation are reflected in the simulation parameters set in the code below.
The function simulate_data() requires 3 arguments (see the function documentation help(simulate_data) for more details):
- pars_c: The simulation parameters of the control group
- pars_t: The simulation parameters of the intervention group
- post_ice1_traj: Specifies how observed outcomes after ICE1 are simulated

Below, we report how data according to the specifications above can be simulated with function simulate_data():
library(rbmi)
library(dplyr)
library(ggplot2)
library(purrr)

set.seed(122)

n <- 100
time <- c(0, 2, 4, 6, 8, 10, 12)

# Mean trajectory control
muC <- c(50.0, 51.66667, 53.33333, 55.0, 56.66667, 58.33333, 60.0)

# Mean trajectory intervention
muT <- c(50.0, 51.66667, 53.33333, 54.16667, 55.0, 55.83333, 56.66667)

# Create Sigma
sd_error <- 2.5
covRE <- rbind(
    c(25.0, 6.25),
    c(6.25, 25.0)
)
Sigma <- cbind(1, time / 12) %*% covRE %*% rbind(1, time / 12) + diag(sd_error^2, nrow = length(time))

# Set probability of discontinuation
probDisc_C <- 0.02
probDisc_T <- 0.03
or_outcome <- 1.10 # +1 point increase => +10% odds of discontinuation

# Set drop-out rate following discontinuation
prob_dropout <- 0.5

# Set simulation parameters of the control group
parsC <- set_simul_pars(
    mu = muC,
    sigma = Sigma,
    n = n,
    prob_ice1 = probDisc_C,
    or_outcome_ice1 = or_outcome,
    prob_post_ice1_dropout = prob_dropout
)

# Set simulation parameters of the intervention group
parsT <- parsC
parsT$mu <- muT
parsT$prob_ice1 <- probDisc_T

# Set assumption about post-ice trajectory
post_ice_traj <- "CIR"

# Simulate data
data <- simulate_data(
    pars_c = parsC,
    pars_t = parsT,
    post_ice1_traj = post_ice_traj
)

head(data)
+#> id visit group outcome_bl outcome_noICE ind_ice1 ind_ice2 dropout_ice1 outcome
+#> 1 id_1 0 Control 57.32704 57.32704 0 0 0 57.32704
+#> 2 id_1 1 Control 57.32704 54.69751 1 0 1 NA
+#> 3 id_1 2 Control 57.32704 58.60702 1 0 1 NA
+#> 4 id_1 3 Control 57.32704 61.50119 1 0 1 NA
+#> 5 id_1 4 Control 57.32704 56.68363 1 0 1 NA
+#> 6 id_1 5 Control 57.32704 66.14799 1 0 1 NA
# As a simple descriptive of the simulated data, summarize the number of subjects with ICEs and missing data
data %>%
    group_by(id) %>%
    summarise(
        group = group[1],
        any_ICE = (any(ind_ice1 == 1)),
        any_NA = any(is.na(outcome))
    ) %>%
    group_by(group) %>%
    summarise(
        subjects_with_ICE = sum(any_ICE),
        subjects_with_missings = sum(any_NA)
    )
#> # A tibble: 2 × 3
+#> group subjects_with_ICE subjects_with_missings
+#> <fct> <int> <int>
+#> 1 Control 18 8
+#> 2 Intervention 25 14
rbmi always uses all non-missing outcome data from the input data set, i.e. such data are never overwritten during the imputation step or removed from the analysis step. This implies that if there are data which are considered to be irrelevant for treatment effect estimation (e.g. data after an ICE for which the estimand specified a hypothetical strategy), then such data need to be removed from the input data set by the user prior to calling the rbmi functions.
For imputation under a missing at random (MAR) strategy, all observed outcome data is also included in the fitting of the base imputation model. However, for ICEs handled using reference-based imputation methods (such as CIR, CR, and JR), rbmi excludes observed post-ICE data from the base imputation model. If these data were not excluded, then the base imputation model would mistakenly estimate mean trajectories based on a mixture of observed pre- and post-ICE data which are not relevant for reference-based imputations. However, any observed post-ICE data are added back into the data set after the fitting of the base imputation model and included as is in the subsequent imputation and analysis steps.

Post-ICE data in the control or reference group are also excluded from the base imputation model if the user specifies a reference-based imputation strategy for such ICEs. This ensures that an ICE has the same impact on the data included in the base imputation model regardless of whether the ICE occurred in the control or the intervention group. On the other hand, imputation in the reference group is based on a MAR assumption even for reference-based imputation methods and it may be preferable in some settings to include such post-ICE data from the control group in the base imputation model. This can be implemented by specifying a MAR strategy for the ICE in the control group and a reference-based strategy for the same ICE in the intervention group. We will use this latter approach in our example below.
The simulated trial data from section 2 assumed that outcomes in the intervention group observed after the ICE “treatment discontinuation” follow the increments observed in the control group. Thus the imputation of missing data in the intervention group after treatment discontinuation might be performed under a reference-based copy increments in reference (CIR) assumption.

Specifically, we implement an estimator for which missing post-ICE outcomes in the control group are imputed under a MAR assumption and missing post-ICE outcomes in the intervention group are imputed under a CIR assumption.
For illustration purposes, we chose MI based on approximate Bayesian posterior draws with 20 random imputations, which is not very demanding from a computational perspective. In practical applications, the number of random imputations may need to be increased. Moreover, other imputation approaches are also supported in rbmi. For guidance regarding the choice of the imputation approach, we refer the user to a comparison between all implemented approaches in Section 3.9 of the “Statistical Specifications” vignette (vignette("stat_specs", package = "rbmi")).

We first report the code to set the variables of the imputation and analysis models. If you are not yet familiar with the syntax, we recommend that you first check the “quickstart” vignette (vignette("quickstart", package = "rbmi")).
# Create data_ice including the subject's first visit affected by the ICE and the imputation strategy
# Imputation strategy for post-ICE data is CIR in the intervention group and MAR for the control group
# (note that ICEs which are handled using MAR are optional and do not impact the analysis
# because imputation of missing data under MAR is the default)
data_ice_CIR <- data %>%
    group_by(id) %>%
    filter(ind_ice1 == 1) %>% # select visits with ICEs
    mutate(strategy = ifelse(group == "Intervention", "CIR", "MAR")) %>%
    summarise(
        visit = visit[1], # Select first visit affected by the ICE
        strategy = strategy[1]
    )

# Compute endpoint of interest: change from baseline and
# remove rows corresponding to baseline visits
data <- data %>%
    filter(visit != 0) %>%
    mutate(
        change = outcome - outcome_bl,
        visit = factor(visit, levels = unique(visit))
    )

# Define key variables for the imputation and analysis models
vars <- set_vars(
    subjid = "id",
    visit = "visit",
    outcome = "change",
    group = "group",
    covariates = c("visit*outcome_bl", "visit*group"),
    strategy = "strategy"
)

vars_an <- vars
vars_an$covariates <- "outcome_bl"
The chosen imputation method can be set with the function method_approxbayes() as follows:

method <- method_approxbayes(n_samples = 20)
We can now sequentially call the 4 key functions of rbmi to perform the multiple imputation. Please note that the management of observed post-ICE data is performed without additional complexity for the user. draws() automatically excludes post-ICE data handled with a reference-based method (but keeps post-ICE data handled using MAR) using information provided by the argument data_ice. impute() will impute only truly missing data in data[[vars$outcome]].
draw_obj <- draws(
    data = data,
    data_ice = data_ice_CIR,
    vars = vars,
    method = method,
    quiet = TRUE,
    ncores = 2
)

impute_obj_CIR <- impute(
    draw_obj,
    references = c("Control" = "Control", "Intervention" = "Control")
)

ana_obj_CIR <- analyse(
    impute_obj_CIR,
    vars = vars_an
)

pool_obj_CIR <- pool(ana_obj_CIR)
pool_obj_CIR
#>
+#> Pool Object
+#> -----------
+#> Number of Results Combined: 20
+#> Method: rubin
+#> Confidence Level: 0.95
+#> Alternative: two.sided
+#>
+#> Results:
+#>
+#> ==================================================
+#> parameter est se lci uci pval
+#> --------------------------------------------------
+#> trt_1 -0.486 0.512 -1.496 0.524 0.343
+#> lsm_ref_1 2.62 0.362 1.907 3.333 <0.001
+#> lsm_alt_1 2.133 0.362 1.42 2.847 <0.001
+#> trt_2 -0.066 0.542 -1.135 1.004 0.904
+#> lsm_ref_2 3.707 0.384 2.95 4.464 <0.001
+#> lsm_alt_2 3.641 0.383 2.885 4.397 <0.001
+#> trt_3 -1.782 0.607 -2.979 -0.585 0.004
+#> lsm_ref_3 5.841 0.428 4.997 6.685 <0.001
+#> lsm_alt_3 4.059 0.428 3.214 4.904 <0.001
+#> trt_4 -2.518 0.692 -3.884 -1.152 <0.001
+#> lsm_ref_4 7.656 0.492 6.685 8.627 <0.001
+#> lsm_alt_4 5.138 0.488 4.176 6.1 <0.001
+#> trt_5 -3.658 0.856 -5.346 -1.97 <0.001
+#> lsm_ref_5 9.558 0.598 8.379 10.738 <0.001
+#> lsm_alt_5 5.9 0.609 4.699 7.101 <0.001
+#> trt_6 -4.537 0.954 -6.42 -2.655 <0.001
+#> lsm_ref_6 11.049 0.666 9.735 12.362 <0.001
+#> lsm_alt_6 6.511 0.674 5.181 7.841 <0.001
+#> --------------------------------------------------
This last output gives an estimated difference of -4.537 (95% CI -6.420 to -2.655) between the two groups at the last visit with an associated p-value lower than 0.001.
The draws() function is by far the most computationally intensive function in rbmi. In some settings, it may be important to explore the impact of a change in the reference-based imputation strategy on the results. Such a change does not affect the imputation model but it does affect the subsequent imputation step. In order to allow changes in the imputation strategy without having to re-run the draws() function, the function impute() has an additional argument update_strategy. However, please note that this functionality comes with some important limitations:
As described at the beginning of Section 3, post-ICE outcomes are included in the input dataset for the base imputation model if the imputation method is MAR but they are excluded for reference-based imputation methods (such as CIR, CR, and JR). Therefore, update_strategy cannot be applied if the imputation strategy is changed from a MAR to a non-MAR strategy in the presence of observed post-ICE outcomes. Similarly, a change from a non-MAR strategy to MAR triggers a warning in the presence of observed post-ICE outcomes because the base imputation model was not fitted to all relevant data under MAR. Finally, update_strategy cannot be applied if the timing of any of the ICEs is changed (in argument data_ice) in addition to the imputation strategy.
As an example, we described an analysis under a copy increments in reference (CIR) assumption in the previous section. Let’s assume we want to change this strategy to a jump to reference imputation strategy for a sensitivity analysis. This can be efficiently implemented using update_strategy as follows:
# Change ICE strategy from CIR to JR
data_ice_JR <- data_ice_CIR %>%
    mutate(strategy = ifelse(strategy == "CIR", "JR", strategy))

impute_obj_JR <- impute(
    draw_obj,
    references = c("Control" = "Control", "Intervention" = "Control"),
    update_strategy = data_ice_JR
)

ana_obj_JR <- analyse(
    impute_obj_JR,
    vars = vars_an
)

pool_obj_JR <- pool(ana_obj_JR)
pool_obj_JR
#>
+#> Pool Object
+#> -----------
+#> Number of Results Combined: 20
+#> Method: rubin
+#> Confidence Level: 0.95
+#> Alternative: two.sided
+#>
+#> Results:
+#>
+#> ==================================================
+#> parameter est se lci uci pval
+#> --------------------------------------------------
+#> trt_1 -0.485 0.513 -1.496 0.526 0.346
+#> lsm_ref_1 2.609 0.363 1.892 3.325 <0.001
+#> lsm_alt_1 2.124 0.361 1.412 2.836 <0.001
+#> trt_2 -0.06 0.535 -1.115 0.995 0.911
+#> lsm_ref_2 3.694 0.378 2.948 4.441 <0.001
+#> lsm_alt_2 3.634 0.381 2.882 4.387 <0.001
+#> trt_3 -1.767 0.598 -2.948 -0.587 0.004
+#> lsm_ref_3 5.845 0.422 5.012 6.677 <0.001
+#> lsm_alt_3 4.077 0.432 3.225 4.93 <0.001
+#> trt_4 -2.529 0.686 -3.883 -1.175 <0.001
+#> lsm_ref_4 7.637 0.495 6.659 8.614 <0.001
+#> lsm_alt_4 5.108 0.492 4.138 6.078 <0.001
+#> trt_5 -3.523 0.856 -5.213 -1.833 <0.001
+#> lsm_ref_5 9.554 0.61 8.351 10.758 <0.001
+#> lsm_alt_5 6.032 0.611 4.827 7.237 <0.001
+#> trt_6 -4.36 0.952 -6.238 -2.482 <0.001
+#> lsm_ref_6 11.003 0.676 9.669 12.337 <0.001
+#> lsm_alt_6 6.643 0.687 5.286 8 <0.001
+#> --------------------------------------------------
For imputations under a jump to reference assumption, we get an estimated difference of -4.360 (95% CI -6.238 to -2.482) between the two groups at the last visit with an associated p-value of <0.001.
Guizzaro et al. (2021) suggested implementing a treatment policy strategy via imputation under a MAR assumption after conditioning on the subject’s ICE status, i.e. to impute missing post-ICE data based on observed post-ICE data. One possible implementation of this proposal is to add time-varying covariates to the imputation model. A case study which implements this proposal and compares it to reference-based imputation methods for estimators in early Parkinson’s disease can be found in Noci et al. (2021).

In some settings, this may be carried out by adding a binary time-varying indicator of the subject’s ICE status at each visit (defined as 0 for pre-ICE visits and as 1 for post-ICE visits) to the imputation model. However, for the simulated data introduced in section 2, it may be more plausible to assume that treatment discontinuation leads to a change in the “slope” of the mean outcome trajectory. This can be implemented by including a time-varying covariate which is equal to 0 for visits prior to the treatment discontinuation and equal to the time from the treatment discontinuation for subsequent visits. The regression coefficient of the corresponding change in the post-ICE “slope” should then be allowed to depend on the assigned treatment group, i.e. the imputation model should include an interaction between the time-varying covariate and the treatment group.

Let’s first define the time-varying covariate:
data <- data %>%
    group_by(id) %>%
    mutate(time_from_ice1 = cumsum(ind_ice1) * 2 / 12) # multiplication by 2/12 because visits are bi-monthly
We can then include the time-varying covariate in the imputation model, crossed with the group variable:
vars_tv <- set_vars(
    subjid = "id",
    visit = "visit",
    outcome = "change",
    group = "group",
    covariates = c("visit*outcome_bl", "visit*group", "time_from_ice1*group"),
    strategy = "strategy"
)
We now sequentially call the 4 key rbmi functions:
draw_obj <- draws(
    data = data,
    data_ice = NULL, # if NULL, MAR is assumed for all missing data
    vars = vars_tv,
    method = method,
    quiet = TRUE
)

impute_obj_tv <- impute(
    draw_obj,
    references = c("Control" = "Control", "Intervention" = "Intervention")
)

ana_obj_tv <- analyse(
    impute_obj_tv,
    vars = vars_an
)

pool(ana_obj_tv)
+#>
+#> Pool Object
+#> -----------
+#> Number of Results Combined: 20
+#> Method: rubin
+#> Confidence Level: 0.95
+#> Alternative: two.sided
+#>
+#> Results:
+#>
+#> ==================================================
+#> parameter est se lci uci pval
+#> --------------------------------------------------
+#> trt_1 -0.491 0.515 -1.507 0.524 0.341
+#> lsm_ref_1 2.623 0.362 1.908 3.338 <0.001
+#> lsm_alt_1 2.131 0.366 1.409 2.854 <0.001
+#> trt_2 0.018 0.55 -1.067 1.103 0.974
+#> lsm_ref_2 3.697 0.382 2.943 4.45 <0.001
+#> lsm_alt_2 3.715 0.394 2.936 4.493 <0.001
+#> trt_3 -1.802 0.614 -3.015 -0.59 0.004
+#> lsm_ref_3 5.815 0.429 4.97 6.661 <0.001
+#> lsm_alt_3 4.013 0.441 3.142 4.884 <0.001
+#> trt_4 -2.543 0.704 -3.932 -1.154 <0.001
+#> lsm_ref_4 7.609 0.486 6.65 8.568 <0.001
+#> lsm_alt_4 5.066 0.516 4.046 6.086 <0.001
+#> trt_5 -3.739 0.879 -5.475 -2.004 <0.001
+#> lsm_ref_5 9.499 0.606 8.302 10.695 <0.001
+#> lsm_alt_5 5.759 0.636 4.502 7.017 <0.001
+#> trt_6 -4.685 0.98 -6.622 -2.748 <0.001
+#> lsm_ref_6 10.988 0.667 9.67 12.305 <0.001
+#> lsm_alt_6 6.303 0.712 4.894 7.711 <0.001
+#> --------------------------------------------------
The following imputation strategies are implemented in rbmi:

- Missing At Random (MAR)
- Jump to Reference (JR)
- Copy Reference (CR)
- Copy Increments in Reference (CIR)
- Last Mean Carried Forward (LMCF)

In addition, rbmi allows the user to implement their own imputation strategy. To do this, the user needs to do three things:

- Define the new imputation strategy as a function of the three arguments described below.
- Specify which patients use this strategy in the data_ice dataset provided to draws().
- Provide the imputation strategy function to impute().

The imputation strategy function must take 3 arguments (pars_group, pars_ref, and index_mar) and calculates the mean and covariance matrix of the subject’s marginal imputation distribution which will then be applied to subjects to which the strategy applies. Here, pars_group contains the predicted mean trajectory (pars_group$mu, a numeric vector) and covariance matrix (pars_group$sigma) for a subject conditional on their assigned treatment group and covariates. pars_ref contains the corresponding mean trajectory and covariance matrix conditional on the reference group and the subject’s covariates. index_mar is a logical vector which specifies for each visit whether the visit is unaffected by an ICE handled using a non-MAR method or not. As an example, the user can check how the CIR strategy was implemented by looking at function strategy_CIR().
strategy_CIR
#> function (pars_group, pars_ref, index_mar)
+#> {
+#> if (all(index_mar)) {
+#> return(pars_group)
+#> }
+#> else if (all(!index_mar)) {
+#> return(pars_ref)
+#> }
+#> mu <- pars_group$mu
+#> last_mar <- which(!index_mar)[1] - 1
+#> increments_from_last_mar_ref <- pars_ref$mu[!index_mar] -
+#> pars_ref$mu[last_mar]
+#> mu[!index_mar] <- mu[last_mar] + increments_from_last_mar_ref
+#> sigma <- compute_sigma(sigma_group = pars_group$sigma, sigma_ref = pars_ref$sigma,
+#> index_mar = index_mar)
+#> pars <- list(mu = mu, sigma = sigma)
+#> return(pars)
+#> }
+#> <bytecode: 0x7fe91cf7a468>
+#> <environment: namespace:rbmi>
To further illustrate this for a simple example, assume that a new strategy is to be implemented as follows:

- The marginal mean of the imputation distribution is equal to the marginal mean trajectory for the subject according to their assigned group and covariates up to the ICE.
- After the ICE, the marginal mean of the imputation distribution is equal to the average of the visit-wise marginal means based on the subject’s covariates and the assigned group or the reference group, respectively.
- For the covariance matrix of the marginal imputation distribution, the covariance matrix from the assigned group is taken.
To do this, we first need to define the imputation function which for this example could be coded as follows:

strategy_AVG <- function(pars_group, pars_ref, index_mar) {
    mu_mean <- (pars_group$mu + pars_ref$mu) / 2
    x <- pars_group
    x$mu[!index_mar] <- mu_mean[!index_mar]
    return(x)
}
And an example showing its use:

pars_group <- list(
    mu = c(1, 2, 3),
    sigma = as_vcov(c(1, 3, 2), c(0.4, 0.5, 0.45))
)

pars_ref <- list(
    mu = c(5, 6, 7),
    sigma = as_vcov(c(2, 1, 1), c(0.7, 0.8, 0.5))
)

index_mar <- c(TRUE, TRUE, FALSE)

strategy_AVG(pars_group, pars_ref, index_mar)
+#> $mu
+#> [1] 1 2 5
+#>
+#> $sigma
+#> [,1] [,2] [,3]
+#> [1,] 1.0 1.2 1.0
+#> [2,] 1.2 9.0 2.7
+#> [3,] 1.0 2.7 4.0
To incorporate this into rbmi, data_ice needs to be updated such that the strategy AVG is specified for visits affected by the ICE. Additionally, the function needs to be provided to impute() via the getStrategies() function as shown below:
data_ice_AVG <- data_ice_CIR %>%
    mutate(strategy = ifelse(strategy == "CIR", "AVG", strategy))

draw_obj <- draws(
    data = data,
    data_ice = data_ice_AVG,
    vars = vars,
    method = method,
    quiet = TRUE
)

impute_obj <- impute(
    draw_obj,
    references = c("Control" = "Control", "Intervention" = "Control"),
    strategies = getStrategies(AVG = strategy_AVG)
)
Then, the analysis could proceed by calling analyse() and pool() as before.
By default rbmi will analyse the data by using the ancova() function. This analysis function fits an ANCOVA model to the outcomes from each visit separately, and returns the “treatment effect” estimate as well as the corresponding least square means for each group. If the user wants to perform a different analysis, or return different statistics from the analysis, then this can be done by using a custom analysis function. Beware that the validity of the conditional mean imputation method has only been formally established for analysis functions corresponding to linear models (such as ANCOVA) and caution is required when applying alternative analysis functions for this method.
The custom analysis function must take a data.frame as its first argument and return a named list with each element itself being a list containing at a minimum a point estimate, called est. For method method_bayes() or method_approxbayes(), the list must additionally contain a standard error (element se) and, if available, the degrees of freedom of the complete-data analysis model (element df).
As a simple example, we replicate the ANCOVA analysis at the last visit for the CIR-based imputations with a user-defined analysis function below:
compare_change_lastvisit <- function(data, ...) {
    fit <- lm(change ~ group + outcome_bl, data = data, subset = (visit == 6))
    res <- list(
        trt = list(
            est = coef(fit)["groupIntervention"],
            se = sqrt(vcov(fit)["groupIntervention", "groupIntervention"]),
            df = df.residual(fit)
        )
    )
    return(res)
}

ana_obj_CIR6 <- analyse(
    impute_obj_CIR,
    fun = compare_change_lastvisit,
    vars = vars_an
)

pool(ana_obj_CIR6)
+#>
+#> Pool Object
+#> -----------
+#> Number of Results Combined: 20
+#> Method: rubin
+#> Confidence Level: 0.95
+#> Alternative: two.sided
+#>
+#> Results:
+#>
+#> =================================================
+#> parameter est se lci uci pval
+#> -------------------------------------------------
+#> trt -4.537 0.954 -6.42 -2.655 <0.001
+#> -------------------------------------------------
As a second example, assume that for a supplementary analysis the user wants to compare the proportion of subjects with a change from baseline of >10 points at the last visit between the treatment groups with the baseline outcome as an additional covariate. This could lead to the following basic analysis function:
compare_prop_lastvisit <- function(data, ...) {
    fit <- glm(
        I(change > 10) ~ group + outcome_bl,
        family = binomial(),
        data = data,
        subset = (visit == 6)
    )
    res <- list(
        trt = list(
            est = coef(fit)["groupIntervention"],
            se = sqrt(vcov(fit)["groupIntervention", "groupIntervention"]),
            df = NA
        )
    )
    return(res)
}

ana_obj_prop <- analyse(
    impute_obj_CIR,
    fun = compare_prop_lastvisit,
    vars = vars_an
)

pool_obj_prop <- pool(ana_obj_prop)
pool_obj_prop
#>
+#> Pool Object
+#> -----------
+#> Number of Results Combined: 20
+#> Method: rubin
+#> Confidence Level: 0.95
+#> Alternative: two.sided
+#>
+#> Results:
+#>
+#> =================================================
+#> parameter est se lci uci pval
+#> -------------------------------------------------
+#> trt -1.052 0.314 -1.667 -0.438 0.001
+#> -------------------------------------------------
tmp <- as.data.frame(pool_obj_prop) %>%
    mutate(
        OR = exp(est),
        OR.lci = exp(lci),
        OR.uci = exp(uci)
    ) %>%
    select(parameter, OR, OR.lci, OR.uci)

tmp
#>   parameter        OR   OR.lci    OR.uci
#> 1       trt 0.3491078 0.188807 0.6455073
Note that if the user wants rbmi to use a normal approximation to the pooled test statistics, then the degrees of freedom need to be set to df = NA (as per the above example). If the degrees of freedom of the complete data test statistics are known or if the degrees of freedom are set to df = Inf, then rbmi pools the degrees of freedom across imputed datasets according to the rule by Barnard and Rubin (see the “Statistical Specifications” vignette (vignette("stat_specs", package = "rbmi")) for details). According to this rule, infinite degrees of freedom for the complete data analysis do not imply that the pooled degrees of freedom are also infinite. Rather, in this case the pooled degrees of freedom are (M-1)/lambda^2, where M is the number of imputations and lambda is the fraction of missing information (see Barnard and Rubin (1999) for details).
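As a purely illustrative calculation of this rule (the values of M and lambda below are made up):

# Illustration only: pooled degrees of freedom when the complete-data df are infinite
M <- 20        # number of imputations (example value)
lambda <- 0.3  # fraction of missing information (example value)
(M - 1) / lambda^2
#> [1] 211.1111

That is, even with infinite complete-data degrees of freedom, the pooled degrees of freedom remain finite.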
Delta-adjustments are used to impute missing data under a not missing at random (NMAR) assumption. This reflects the belief that unobserved outcomes would have been systematically “worse” (or “better”) than “comparable” observed outcomes. For an extensive discussion of delta-adjustment methods, we refer to Cro et al. (2020).

In rbmi, a marginal delta-adjustment approach is implemented. This means that the delta-adjustment is applied to the dataset after data imputation under MAR or reference-based missing data assumptions and prior to the analysis of the imputed data. Sensitivity analysis using delta-adjustments can therefore be performed without having to re-fit the imputation model. In rbmi, they are implemented via the delta argument of the analyse() function.
The delta argument of analyse() allows users to modify the outcome variable prior to the analysis. To do this, the user needs to provide a data.frame which contains columns for the subject and visit (to identify the observation to be adjusted) plus an additional column called delta which specifies the value which will be added to the outcomes prior to the analysis.

The delta_template() function supports the user in creating this data.frame: it creates a skeleton data.frame containing one row per subject and visit with the value of delta set to 0 for all observations:
dat_delta <- delta_template(imputations = impute_obj_CIR)
head(dat_delta)
+#> id visit group is_mar is_missing is_post_ice strategy delta
+#> 1 id_1 1 Control TRUE TRUE TRUE MAR 0
+#> 2 id_1 2 Control TRUE TRUE TRUE MAR 0
+#> 3 id_1 3 Control TRUE TRUE TRUE MAR 0
+#> 4 id_1 4 Control TRUE TRUE TRUE MAR 0
+#> 5 id_1 5 Control TRUE TRUE TRUE MAR 0
+#> 6 id_1 6 Control TRUE TRUE TRUE MAR 0
Note that the output of delta_template() contains additional information which can be used to properly re-set the variable delta.
For example, assume that the user wants to implement a delta-adjustment to the imputed values under CIR described in section 3.
Specifically, assume that a fixed “worsening adjustment” of +5 points is applied to all imputed values regardless of the treatment group. This could be programmed as follows:
# Set delta-value to 5 for all imputed (previously missing) outcomes and 0 for all other outcomes
dat_delta <- delta_template(imputations = impute_obj_CIR) %>%
    mutate(delta = is_missing * 5)

# Repeat the analyses with the delta-adjusted values and pool results
ana_delta <- analyse(
    impute_obj_CIR,
    delta = dat_delta,
    vars = vars_an
)
pool(ana_delta)
+#>
+#> Pool Object
+#> -----------
+#> Number of Results Combined: 20
+#> Method: rubin
+#> Confidence Level: 0.95
+#> Alternative: two.sided
+#>
+#> Results:
+#>
+#> ==================================================
+#> parameter est se lci uci pval
+#> --------------------------------------------------
+#> trt_1 -0.482 0.524 -1.516 0.552 0.359
+#> lsm_ref_1 2.718 0.37 1.987 3.448 <0.001
+#> lsm_alt_1 2.235 0.37 1.505 2.966 <0.001
+#> trt_2 -0.016 0.56 -1.12 1.089 0.978
+#> lsm_ref_2 3.907 0.396 3.125 4.688 <0.001
+#> lsm_alt_2 3.891 0.396 3.111 4.671 <0.001
+#> trt_3 -1.684 0.641 -2.949 -0.42 0.009
+#> lsm_ref_3 6.092 0.452 5.201 6.983 <0.001
+#> lsm_alt_3 4.408 0.452 3.515 5.3 <0.001
+#> trt_4 -2.359 0.741 -3.821 -0.897 0.002
+#> lsm_ref_4 7.951 0.526 6.913 8.99 <0.001
+#> lsm_alt_4 5.593 0.522 4.563 6.623 <0.001
+#> trt_5 -3.34 0.919 -5.153 -1.526 <0.001
+#> lsm_ref_5 9.899 0.643 8.631 11.168 <0.001
+#> lsm_alt_5 6.559 0.653 5.271 7.848 <0.001
+#> trt_6 -4.21 1.026 -6.236 -2.184 <0.001
+#> lsm_ref_6 11.435 0.718 10.019 12.851 <0.001
+#> lsm_alt_6 7.225 0.725 5.793 8.656 <0.001
+#> --------------------------------------------------
The same approach can be used to implement a tipping point analysis. Here, we apply different delta-adjustments to imputed data from the control and the intervention group, respectively. Assume that delta-adjustments by less than -5 points or by more than +15 points are considered implausible from a clinical perspective. Therefore, we vary the delta-values in each group between -5 and +15 points to investigate which delta combinations lead to a “tipping” of the primary analysis result, defined here as an analysis p-value ≥ 0.05.
perform_tipp_analysis <- function(delta_control, delta_intervention) {

    # Derive delta offset based on control and intervention specific deltas
    delta_df <- delta_df_init %>%
        mutate(
            delta_ctl = (group == "Control") * is_missing * delta_control,
            delta_int = (group == "Intervention") * is_missing * delta_intervention,
            delta = delta_ctl + delta_int
        )

    ana_delta <- analyse(
        impute_obj_CIR,
        fun = compare_change_lastvisit,
        vars = vars_an,
        delta = delta_df
    )

    pool_delta <- as.data.frame(pool(ana_delta))

    list(
        trt_effect_6 = pool_delta[["est"]],
        pval_6 = pool_delta[["pval"]]
    )
}

# Get initial delta template
delta_df_init <- delta_template(impute_obj_CIR)

tipp_frame_grid <- expand.grid(
    delta_control = seq(-5, 15, by = 2),
    delta_intervention = seq(-5, 15, by = 2)
) %>%
    as_tibble()

tipp_frame <- tipp_frame_grid %>%
    mutate(
        results_list = map2(delta_control, delta_intervention, perform_tipp_analysis),
        trt_effect_6 = map_dbl(results_list, "trt_effect_6"),
        pval_6 = map_dbl(results_list, "pval_6")
    ) %>%
    select(-results_list) %>%
    mutate(
        pval = cut(
            pval_6,
            c(0, 0.001, 0.01, 0.05, 0.2, 1),
            right = FALSE,
            labels = c("<0.001", "0.001 - <0.01", "0.01- <0.05", "0.05 - <0.20", ">= 0.20")
        )
    )

# Show delta values which lead to non-significant analysis results
tipp_frame %>%
    filter(pval_6 >= 0.05)
+ #> # A tibble: 3 × 5
+#> delta_control delta_intervention trt_effect_6 pval_6 pval
+#> <dbl> <dbl> <dbl> <dbl> <fct>
+#> 1 -5 15 -1.99 0.0935 0.05 - <0.20
+#> 2 -3 15 -2.15 0.0704 0.05 - <0.20
+#> 3 -1 15 -2.31 0.0526 0.05 - <0.20
ggplot(tipp_frame, aes(delta_control, delta_intervention, fill = pval)) +
    geom_raster() +
    scale_fill_manual(values = c("darkgreen", "lightgreen", "lightyellow", "orange", "red"))
According to this analysis, the significant test result from the primary analysis under CIR could only be tipped to a non-significant result for rather extreme delta-adjustments. Please note that for a real analysis it is recommended to use a smaller step size in the grid than what has been used here.
So far, we have only discussed simple delta arguments which add the same value to all imputed values. However, the user may want to apply more flexible delta-adjustments to missing values after an intercurrent event (ICE) and vary the magnitude of the delta adjustment depending on how far away the visit in question is from the ICE visit.
To facilitate the creation of such flexible delta-adjustments, the delta_template() function has two optional additional arguments delta and dlag. The delta argument specifies the default amount of delta that should be applied to each post-ICE visit, whilst dlag specifies the scaling coefficient to be applied based upon the visit’s proximity to the first visit affected by the ICE. By default, the delta will only be added to unobserved (i.e. imputed) post-ICE outcomes but this can be changed by setting the optional argument missing_only = FALSE.
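For example, a delta-adjustment of observed post-ICE outcomes in addition to imputed ones could be requested as sketched below (the object name dat_delta_all is ours):

# Sketch: create a delta template in which observed post-ICE outcomes
# also receive a delta (missing_only = FALSE)
dat_delta_all <- delta_template(
    imputations = impute_obj_CIR,
    missing_only = FALSE
)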
The usage of the delta and dlag arguments is best illustrated with a few examples:
Assume a setting with 4 visits and that the user specified delta = c(5,6,7,8) and dlag = c(1,2,3,4). For a subject for whom the first visit affected by the ICE is visit 2, these values of delta and dlag would imply the following delta offset:
v1 v2 v3 v4
--------------
 5  6  7  8  # delta assigned to each visit
 0  1  2  3  # scaling starting from the first visit after the subjects ICE
--------------
 0  6 14 24  # delta * scaling
--------------
 0  6 20 44  # cumulative sum (i.e. delta) to be applied to each visit
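The arithmetic behind this table can be reproduced with a couple of lines of base R (a sketch, not rbmi code; the scaling vector is simply the dlag values laid out from the first visit affected by the ICE):

delta <- c(5, 6, 7, 8)   # delta assigned to each visit
scaling <- c(0, 1, 2, 3) # dlag values, starting at the first affected visit
cumsum(delta * scaling)
#> [1]  0  6 20 44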
That is, the subject would have a delta offset of 0 applied to visit v1, 6 for visit v2, 20 for visit v3 and 44 for visit v4.

Assume instead that the subject’s first visit affected by the ICE was visit 3. Then, the above values of delta and dlag would imply the following delta offset:
v1 v2 v3 v4
--------------
 5  6  7  8  # delta assigned to each visit
 0  0  1  2  # scaling starting from the first visit after the subjects ICE
--------------
 0  0  7 16  # delta * scaling
--------------
 0  0  7 23  # cumulative sum (i.e. delta) to be applied to each visit
To apply a constant delta value of +5 to all visits affected by the ICE regardless of their proximity to the first ICE visit, one could set delta = c(5,5,5,5) and dlag = c(1,0,0,0). Alternatively, it may be more straightforward for this setting to call the delta_template() function without the delta and dlag arguments and then overwrite the delta column of the resulting data.frame as described in the previous section (and additionally relying on the is_post_ice variable).
Another way of using these arguments is to set delta to the difference in time between visits and dlag to be the amount of delta per unit of time. For example, let’s say that visits occur on weeks 1, 5, 6 and 9 and that we want a delta of 3 to be applied for each week after an ICE. For simplicity, we assume that the ICE occurs immediately after the subject’s last visit which is not affected by the ICE. This could be achieved by setting delta = c(1,4,1,3) (the difference in weeks between each visit) and dlag = c(3, 3, 3, 3).
Assuming a subject’s first visit affected by the ICE was visit v2, these values of delta and dlag would imply the following delta offsets:
v1 v2 v3 v4
--------------
 1  4  1  3  # delta assigned to each visit
 0  3  3  3  # scaling starting from the first visit after the subjects ICE
--------------
 0 12  3  9  # delta * scaling
--------------
 0 12 15 24  # cumulative sum (i.e. delta) to be applied to each visit
To wrap up, we show this in action for our simulated dataset from section 2 and the imputed datasets based on a CIR assumption from section 3. The simulation setting specified follow-up visits at months 2, 4, 6, 8, 10, and 12. Assume that we want to apply a delta-adjustment of 1 for every month after an ICE to unobserved post-ICE visits from the intervention group only. (E.g. if the ICE occurred immediately after the month 4 visit, then the total delta applied to a missing value from the month 10 visit would be 6.)
To program this, we first use the delta and dlag arguments of delta_template() to set up a corresponding template data.frame:
delta_df <- delta_template(
    impute_obj_CIR,
    delta = c(2, 2, 2, 2, 2, 2),
    dlag = c(1, 1, 1, 1, 1, 1)
)

head(delta_df)
#>     id visit   group is_mar is_missing is_post_ice strategy delta
#> 1 id_1     1 Control   TRUE       TRUE        TRUE      MAR     2
#> 2 id_1     2 Control   TRUE       TRUE        TRUE      MAR     4
#> 3 id_1     3 Control   TRUE       TRUE        TRUE      MAR     6
#> 4 id_1     4 Control   TRUE       TRUE        TRUE      MAR     8
#> 5 id_1     5 Control   TRUE       TRUE        TRUE      MAR    10
#> 6 id_1     6 Control   TRUE       TRUE        TRUE      MAR    12
Next, we can use the additional metadata variables provided by delta_template() to manually reset the delta values for the control group back to 0:
delta_df2 <- delta_df %>%
    mutate(delta = if_else(group == "Control", 0, delta))

head(delta_df2)
#>     id visit   group is_mar is_missing is_post_ice strategy delta
#> 1 id_1     1 Control   TRUE       TRUE        TRUE      MAR     0
#> 2 id_1     2 Control   TRUE       TRUE        TRUE      MAR     0
#> 3 id_1     3 Control   TRUE       TRUE        TRUE      MAR     0
#> 4 id_1     4 Control   TRUE       TRUE        TRUE      MAR     0
#> 5 id_1     5 Control   TRUE       TRUE        TRUE      MAR     0
#> 6 id_1     6 Control   TRUE       TRUE        TRUE      MAR     0
Finally, we can use our delta data.frame to apply the desired delta offset to our analysis:
ana_delta <- analyse(impute_obj_CIR, delta = delta_df2, vars = vars_an)

pool(ana_delta)
#> 
#> Pool Object
#> -----------
#> Number of Results Combined: 20
#> Method: rubin
#> Confidence Level: 0.95
#> Alternative: two.sided
#> 
#> Results:
#> 
#> ==================================================
#>  parameter    est      se     lci      uci    pval
#> --------------------------------------------------
#>    trt_1    -0.446   0.514  -1.459   0.567   0.386
#>  lsm_ref_1   2.62    0.363   1.904   3.335  <0.001
#>  lsm_alt_1   2.173   0.363   1.458   2.889  <0.001
#>    trt_2     0.072   0.546  -1.006   1.15    0.896
#>  lsm_ref_2   3.708   0.387   2.945   4.471  <0.001
#>  lsm_alt_2   3.78    0.386   3.018   4.542  <0.001
#>    trt_3    -1.507   0.626  -2.743  -0.272   0.017
#>  lsm_ref_3   5.844   0.441   4.973   6.714  <0.001
#>  lsm_alt_3   4.336   0.442   3.464   5.209  <0.001
#>    trt_4    -2.062   0.731  -3.504  -0.621   0.005
#>  lsm_ref_4   7.658   0.519   6.634   8.682  <0.001
#>  lsm_alt_4   5.596   0.515   4.58    6.612  <0.001
#>    trt_5    -2.938   0.916  -4.746  -1.13    0.002
#>  lsm_ref_5   9.558   0.641   8.293  10.823  <0.001
#>  lsm_alt_5   6.62    0.651   5.335   7.906  <0.001
#>    trt_6    -3.53    1.045  -5.591  -1.469   0.001
#>  lsm_ref_6  11.045   0.73    9.604  12.486  <0.001
#>  lsm_alt_6   7.515   0.738   6.058   8.971  <0.001
#> --------------------------------------------------
The purpose of this vignette is to provide a 15 minute quickstart guide to the core functions of the rbmi package.
The rbmi package consists of 4 core functions (plus several helper functions) which are typically called in sequence:

draws() - fits the imputation models and stores their parameters
impute() - creates multiple imputed datasets
analyse() - analyses each of the multiple imputed datasets
pool() - combines the analysis results across imputed datasets into a single statistic

We use a publicly available example dataset from an antidepressant clinical trial of an active drug versus placebo. The relevant endpoint is the Hamilton 17-item depression rating scale (HAMD17) which was assessed at baseline and at weeks 1, 2, 4, and 6. Study drug discontinuation occurred in 24% of subjects from the active drug and 26% of subjects from placebo. All data after study drug discontinuation are missing and there is a single additional intermittent missing observation.
library(rbmi)
library(dplyr)

data("antidepressant_data")
dat <- antidepressant_data
We consider an imputation model with the mean change from baseline in the HAMD17 score as the outcome (variable CHANGE in the dataset). The following covariates are included in the imputation model: the treatment group (THERAPY), the (categorical) visit (VISIT), treatment-by-visit interactions, the baseline HAMD17 score (BASVAL), and baseline HAMD17 score-by-visit interactions. A common unstructured covariance matrix structure is assumed for both groups. The analysis model is an ANCOVA model with the treatment group as the primary covariate and adjustment for the baseline HAMD17 score.
rbmi expects its input dataset to be complete; that is, there must be one row per subject for each visit. Missing outcome values should be coded as NA, while missing covariate values are not allowed. If the dataset is incomplete, then the expand_locf() helper function can be used to add any missing rows, using LOCF imputation to carry forward the observed baseline covariate values to visits with missing outcomes. Rows corresponding to missing outcomes are not present in the antidepressant trial dataset. To address this, we therefore use the expand_locf() function as follows:
# Use expand_locf to add rows corresponding to visits with missing outcomes to the dataset
dat <- expand_locf(
    dat,
    PATIENT = levels(dat$PATIENT), # expand by PATIENT and VISIT
    VISIT = levels(dat$VISIT),
    vars = c("BASVAL", "THERAPY"), # fill with LOCF BASVAL and THERAPY
    group = c("PATIENT"),
    order = c("PATIENT", "VISIT")
)
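As a quick sanity check (this check is an addition and not part of the original code), one can verify that the expanded dataset now contains exactly one row per patient and visit:

# Optional check: the expanded data should have one row per PATIENT and VISIT combination
stopifnot(all(table(dat$PATIENT, dat$VISIT) == 1))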
The draws() function fits the imputation models and stores the corresponding parameter estimates or Bayesian posterior parameter draws. The three main inputs to the draws() function are:
data - The primary longitudinal data.frame containing the outcome variable and all covariates.
data_ice - A data.frame which specifies the first visit affected by an intercurrent event (ICE) and the imputation strategy for handling missing outcome data after the ICE. At most one ICE which is to be imputed by a non-MAR strategy is allowed per subject.
method - The statistical method used to fit the imputation models and to create imputed datasets.

For the antidepressant trial data, the dataset data_ice is not provided. However, it can be derived because, in this dataset, the subject's first visit affected by the ICE "study drug discontinuation" corresponds to the first terminal missing observation. We first derive the dataset data_ice and then create 150 Bayesian posterior draws of the imputation model parameters.

For this example, we assume that the imputation strategy after the ICE is Jump To Reference (JR) for all subjects and that 150 multiple imputed datasets using Bayesian posterior draws from the imputation model are to be created.
# create data_ice and set the imputation strategy to JR for
# each patient with at least one missing observation
dat_ice <- dat %>%
    arrange(PATIENT, VISIT) %>%
    filter(is.na(CHANGE)) %>%
    group_by(PATIENT) %>%
    slice(1) %>%
    ungroup() %>%
    select(PATIENT, VISIT) %>%
    mutate(strategy = "JR")

# In this dataset, subject 3618 has an intermittent missing value which does not correspond
# to a study drug discontinuation. We therefore remove this subject from `dat_ice`.
# (In the later imputation step, it will automatically be imputed under the default MAR assumption.)
dat_ice <- dat_ice[-which(dat_ice$PATIENT == 3618), ]

dat_ice
#> # A tibble: 43 × 3
#>    PATIENT VISIT strategy
#>    <fct>   <fct> <chr>
#>  1 1513    5     JR
#>  2 1514    5     JR
#>  3 1517    5     JR
#>  4 1804    7     JR
#>  5 2104    7     JR
#>  6 2118    5     JR
#>  7 2218    6     JR
#>  8 2230    6     JR
#>  9 2721    5     JR
#> 10 2729    5     JR
#> # … with 33 more rows

# Define the names of key variables in our dataset and
# the covariates included in the imputation model using `set_vars()`
# Note that the covariates argument can also include interaction terms
vars <- set_vars(
    outcome = "CHANGE",
    visit = "VISIT",
    subjid = "PATIENT",
    group = "THERAPY",
    covariates = c("BASVAL*VISIT", "THERAPY*VISIT")
)

# Define which imputation method to use (here: Bayesian multiple imputation with 150 imputed datasets)
method <- method_bayes(
    burn_in = 200,
    burn_between = 5,
    n_samples = 150
)

# Create samples for the imputation parameters by running the draws() function
set.seed(987)
drawObj <- draws(
    data = dat,
    data_ice = dat_ice,
    vars = vars,
    method = method,
    quiet = TRUE
)

drawObj
#> 
#> Draws Object
#> ------------
#> Number of Samples: 150
#> Number of Failed Samples: 0
#> Model Formula: CHANGE ~ 1 + THERAPY + VISIT + BASVAL * VISIT + THERAPY * VISIT
#> Imputation Type: random
#> Method:
#>     name: Bayes
#>     burn_in: 200
#>     burn_between: 5
#>     same_cov: TRUE
#>     n_samples: 150
#>     seed: NA
Note the use of set_vars() which specifies the names of the key variables within the dataset and the imputation model. Additionally, note that whilst vars$group and vars$visit are added as terms to the imputation model by default, their interaction is not, thus the inclusion of group * visit in the list of covariates.
Available imputation methods include:

method_bayes()
method_approxbayes()
method_condmean(type = "bootstrap")
method_condmean(type = "jackknife")
method_bmlmi()
For a comparison of these methods, we refer to the "statistical specifications" vignette (Section 3.10): vignette("stat_specs", package = "rbmi").
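As a hedged illustration only (this call is not used in this vignette), a different method object could be supplied to draws(); for example, conditional mean imputation with jackknife-based inference could be requested as follows:

# Illustrative alternative (not run here): conditional mean imputation + jackknife
method <- method_condmean(type = "jackknife")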
Available imputation strategies include:

"MAR"
"JR"
"CR"
"CIR"
"LMCF"
The next step is to use the parameters from the imputation model to generate the imputed datasets. This is done via the impute() function. The function only has two key inputs: the imputation model output from draws() and the reference groups relevant to reference-based imputation methods. Its usage is thus:
imputeObj <- impute(
    drawObj,
    references = c("DRUG" = "PLACEBO", "PLACEBO" = "PLACEBO")
)

imputeObj
#> 
#> Imputation Object
#> -----------------
#> Number of Imputed Datasets: 150
#> Fraction of Missing Data (Original Dataset):
#>     4: 0%
#>     5: 8%
#>     6: 13%
#>     7: 25%
#> References:
#>     DRUG -> PLACEBO
#>     PLACEBO -> PLACEBO
In this instance, we are specifying that the PLACEBO group should be the reference group for itself as well as for the DRUG group (as is standard for imputation using reference-based methods).
Generally speaking, there is no need to see or directly interact with the imputed datasets. However, if you do wish to inspect them, they can be extracted from the imputation object using the extract_imputed_dfs() helper function, i.e.:
imputed_dfs <- extract_imputed_dfs(imputeObj)
head(imputed_dfs[[10]], 12) # first 12 rows of 10th imputed dataset
#>     PATIENT HAMATOTL PGIIMP RELDAYS VISIT THERAPY GENDER POOLINV BASVAL HAMDTL17 CHANGE
#> 1  new_pt_1       21      2       7     4    DRUG      F     006     32       21    -11
#> 2  new_pt_1       19      2      14     5    DRUG      F     006     32       20    -12
#> 3  new_pt_1       21      3      28     6    DRUG      F     006     32       19    -13
#> 4  new_pt_1       17      4      42     7    DRUG      F     006     32       17    -15
#> 5  new_pt_2       18      3       7     4 PLACEBO      F     006     14       11     -3
#> 6  new_pt_2       18      2      15     5 PLACEBO      F     006     14       14      0
#> 7  new_pt_2       14      3      29     6 PLACEBO      F     006     14        9     -5
#> 8  new_pt_2        8      2      42     7 PLACEBO      F     006     14        5     -9
#> 9  new_pt_3       18      3       7     4    DRUG      F     006     21       20     -1
#> 10 new_pt_3       17      3      14     5    DRUG      F     006     21       18     -3
#> 11 new_pt_3       12      3      28     6    DRUG      F     006     21       16     -5
#> 12 new_pt_3        9      3      44     7    DRUG      F     006     21       13     -8
Note that in the case of method_bayes() or method_approxbayes(), all imputed datasets correspond to random imputations on the original dataset. For method_condmean(), the first imputed dataset will always correspond to the completed original dataset containing all subjects. For method_condmean(type = "jackknife"), the remaining datasets correspond to conditional mean imputations on leave-one-subject-out datasets, whereas for method_condmean(type = "bootstrap"), each subsequent dataset corresponds to a conditional mean imputation on a bootstrapped dataset. For method_bmlmi(), all the imputed datasets correspond to sets of random imputations on bootstrapped datasets.
The next step is to run the analysis model on each imputed dataset. This is done by defining an analysis function and then calling analyse() to apply this function to each imputed dataset. For this vignette we use the ancova() function provided by the rbmi package which fits a separate ANCOVA model for the outcomes from each visit and returns a treatment effect estimate and corresponding least square means for each group per visit.
anaObj <- analyse(
    imputeObj,
    ancova,
    vars = set_vars(
        subjid = "PATIENT",
        outcome = "CHANGE",
        visit = "VISIT",
        group = "THERAPY",
        covariates = c("BASVAL")
    )
)

anaObj
#> 
#> Analysis Object
#> ---------------
#> Number of Results: 150
#> Analysis Function: ancova
#> Delta Applied: FALSE
#> Analysis Estimates:
#>     trt_4
#>     lsm_ref_4
#>     lsm_alt_4
#>     trt_5
#>     lsm_ref_5
#>     lsm_alt_5
#>     trt_6
#>     lsm_ref_6
#>     lsm_alt_6
#>     trt_7
#>     lsm_ref_7
#>     lsm_alt_7
Note that, similar to draws(), the ancova() function uses the set_vars() function which determines the names of the key variables within the data and the covariates (in addition to the treatment group) for which the analysis model will be adjusted.

Please also note that the names of the analysis estimates contain "ref" and "alt" to refer to the two treatment arms. In particular, "ref" refers to the first factor level of vars$group which does not necessarily coincide with the control arm. In this example, since levels(dat[[vars$group]]) = c("DRUG", "PLACEBO"), the results associated with "ref" correspond to the intervention arm, while those associated with "alt" correspond to the control arm.
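If "ref" should instead correspond to the control arm, one option (a sketch, not taken from the vignette itself) is to reorder the factor levels of the group variable before calling draws(); here this would mean releveling THERAPY so that PLACEBO becomes its first level:

# Illustrative only: make PLACEBO the first factor level so that "ref" corresponds to the control arm
dat$THERAPY <- relevel(dat$THERAPY, ref = "PLACEBO")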
Additionally, we can use the delta argument of analyse() to perform a delta adjustment of the imputed datasets prior to the analysis. In brief, this is implemented by specifying a data.frame that contains the amount of adjustment to be added to each longitudinal outcome for each subject and visit, i.e. the data.frame must contain the columns subjid, visit, and delta.
It is appreciated that carrying out this procedure is potentially tedious, therefore the delta_template() helper function has been provided to simplify it. In particular, delta_template() returns a shell data.frame where the delta-adjustment is set to 0 for all patients. Additionally, delta_template() adds several meta-variables onto the shell data.frame which can be used for manual derivation or manipulation of the delta-adjustment.
For example, let's say we want to add a delta-value of 5 to all imputed values (i.e. those values which were missing in the original dataset) in the drug arm. That could then be implemented as follows:
# For reference show the additional meta variables provided
delta_template(imputeObj) %>% as_tibble()
#> # A tibble: 688 × 8
#>    PATIENT VISIT THERAPY is_mar is_missing is_post_ice strategy delta
#>    <fct>   <fct> <fct>   <lgl>  <lgl>      <lgl>       <chr>    <dbl>
#>  1 1503    4     DRUG    TRUE   FALSE      FALSE       <NA>         0
#>  2 1503    5     DRUG    TRUE   FALSE      FALSE       <NA>         0
#>  3 1503    6     DRUG    TRUE   FALSE      FALSE       <NA>         0
#>  4 1503    7     DRUG    TRUE   FALSE      FALSE       <NA>         0
#>  5 1507    4     PLACEBO TRUE   FALSE      FALSE       <NA>         0
#>  6 1507    5     PLACEBO TRUE   FALSE      FALSE       <NA>         0
#>  7 1507    6     PLACEBO TRUE   FALSE      FALSE       <NA>         0
#>  8 1507    7     PLACEBO TRUE   FALSE      FALSE       <NA>         0
#>  9 1509    4     DRUG    TRUE   FALSE      FALSE       <NA>         0
#> 10 1509    5     DRUG    TRUE   FALSE      FALSE       <NA>         0
#> # … with 678 more rows

delta_df <- delta_template(imputeObj) %>%
    as_tibble() %>%
    mutate(delta = if_else(THERAPY == "DRUG" & is_missing, 5, 0)) %>%
    select(PATIENT, VISIT, delta)

delta_df
#> # A tibble: 688 × 3
#>    PATIENT VISIT delta
#>    <fct>   <fct> <dbl>
#>  1 1503    4         0
#>  2 1503    5         0
#>  3 1503    6         0
#>  4 1503    7         0
#>  5 1507    4         0
#>  6 1507    5         0
#>  7 1507    6         0
#>  8 1507    7         0
#>  9 1509    4         0
#> 10 1509    5         0
#> # … with 678 more rows

anaObj_delta <- analyse(
    imputeObj,
    ancova,
    delta = delta_df,
    vars = set_vars(
        subjid = "PATIENT",
        outcome = "CHANGE",
        visit = "VISIT",
        group = "THERAPY",
        covariates = c("BASVAL")
    )
)
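The delta-adjusted results stored in anaObj_delta can be pooled in exactly the same way as the unadjusted results; as a sketch (this call is not shown in the original text):

# Sketch: pooling the delta-adjusted results works exactly as for the unadjusted analysis
pool(anaObj_delta)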
Finally, the pool() function can be used to summarise the analysis results across multiple imputed datasets to provide an overall statistic with a standard error, confidence intervals and a p-value for the hypothesis test of the null hypothesis that the effect is equal to 0.
Note that the pooling method is automatically derived based on the method that was specified in the original call to draws():

method_bayes() or method_approxbayes() - pooling and inference are based on Rubin's rules.
method_condmean(type = "bootstrap") - inference is either based on a normal approximation using the bootstrap standard error (pool(..., type = "normal")) or on the bootstrap percentiles (pool(..., type = "percentile")).
method_condmean(type = "jackknife") - inference is based on a normal approximation using the jackknife estimate of the standard error.
method_bmlmi() - inference is according to the methods described by von Hippel and Bartlett (see the stat_specs vignette for details).

Since we have used Bayesian multiple imputation in this vignette, the pool() function will automatically use Rubin's rules.
poolObj <- pool(
    anaObj,
    conf.level = 0.95,
    alternative = "two.sided"
)

poolObj
#> 
#> Pool Object
#> -----------
#> Number of Results Combined: 150
#> Method: rubin
#> Confidence Level: 0.95
#> Alternative: two.sided
#> 
#> Results:
#> 
#> ==================================================
#>  parameter    est      se     lci      uci    pval
#> --------------------------------------------------
#>    trt_4    -0.092   0.683  -1.439   1.256   0.893
#>  lsm_ref_4  -1.616   0.486  -2.576  -0.656   0.001
#>  lsm_alt_4  -1.708   0.475  -2.645  -0.77   <0.001
#>    trt_5     1.34    0.925  -0.486   3.166   0.149
#>  lsm_ref_5  -4.154   0.66   -5.457  -2.852  <0.001
#>  lsm_alt_5  -2.815   0.647  -4.092  -1.538  <0.001
#>    trt_6     1.956   0.999  -0.018   3.931   0.052
#>  lsm_ref_6  -6.1     0.72   -7.523  -4.678  <0.001
#>  lsm_alt_6  -4.144   0.695  -5.516  -2.772  <0.001
#>    trt_7     2.178   1.124  -0.043   4.399   0.055
#>  lsm_ref_7  -7.001   0.822  -8.628  -5.375  <0.001
#>  lsm_alt_7  -4.823   0.789  -6.384  -3.263  <0.001
#> --------------------------------------------------
The table of values shown in the print message for poolObj can also be extracted using the as.data.frame() function:
as.data.frame(poolObj)
#>    parameter         est        se         lci        uci         pval
#> 1      trt_4 -0.09180645 0.6826279 -1.43949684  1.2558839 8.931772e-01
#> 2  lsm_ref_4 -1.61581996 0.4862316 -2.57577141 -0.6558685 1.093708e-03
#> 3  lsm_alt_4 -1.70762640 0.4749573 -2.64531931 -0.7699335 4.262148e-04
#> 4      trt_5  1.33975256 0.9245536 -0.48627442  3.1657795 1.492897e-01
#> 5  lsm_ref_5 -4.15439612 0.6596139 -5.45719295 -2.8515993 2.853288e-09
#> 6  lsm_alt_5 -2.81464355 0.6465855 -4.09178487 -1.5375022 2.412198e-05
#> 7      trt_6  1.95633775 0.9994792 -0.01823149  3.9309070 5.212617e-02
#> 8  lsm_ref_6 -6.10037936 0.7197954 -7.52269771 -4.6780610 2.097471e-14
#> 9  lsm_alt_6 -4.14404161 0.6946859 -5.51643551 -2.7716477 1.625003e-08
#> 10     trt_7  2.17794477 1.1237356 -0.04320821  4.3990977 5.456450e-02
#> 11 lsm_ref_7 -7.00142993 0.8223970 -8.62790047 -5.3749594 2.911670e-14
#> 12 lsm_alt_7 -4.82348516 0.7893734 -6.38405217 -3.2629182 9.210812e-09
These outputs give an estimated difference of 2.178 (95% CI -0.043 to 4.399) between the two groups at the last visit with an associated p-value of 0.055.

We report below all the code presented in this vignette.
library(rbmi)
library(dplyr)

data("antidepressant_data")
dat <- antidepressant_data

# Use expand_locf to add rows corresponding to visits with missing outcomes to the dataset
dat <- expand_locf(
    dat,
    PATIENT = levels(dat$PATIENT), # expand by PATIENT and VISIT
    VISIT = levels(dat$VISIT),
    vars = c("BASVAL", "THERAPY"), # fill with LOCF BASVAL and THERAPY
    group = c("PATIENT"),
    order = c("PATIENT", "VISIT")
)

# Create data_ice and set the imputation strategy to JR for
# each patient with at least one missing observation
dat_ice <- dat %>%
    arrange(PATIENT, VISIT) %>%
    filter(is.na(CHANGE)) %>%
    group_by(PATIENT) %>%
    slice(1) %>%
    ungroup() %>%
    select(PATIENT, VISIT) %>%
    mutate(strategy = "JR")

# In this dataset, subject 3618 has an intermittent missing value which does not correspond
# to a study drug discontinuation. We therefore remove this subject from `dat_ice`.
# (In the later imputation step, it will automatically be imputed under the default MAR assumption.)
dat_ice <- dat_ice[-which(dat_ice$PATIENT == 3618), ]

# Define the names of key variables in our dataset using `set_vars()`
# and the covariates included in the imputation model
# Note that the covariates argument can also include interaction terms
vars <- set_vars(
    outcome = "CHANGE",
    visit = "VISIT",
    subjid = "PATIENT",
    group = "THERAPY",
    covariates = c("BASVAL*VISIT", "THERAPY*VISIT")
)

# Define which imputation method to use (here: Bayesian multiple imputation with 150 imputed datasets)
method <- method_bayes(
    burn_in = 200,
    burn_between = 5,
    n_samples = 150
)

# Create samples for the imputation parameters by running the draws() function
set.seed(987)
drawObj <- draws(
    data = dat,
    data_ice = dat_ice,
    vars = vars,
    method = method,
    quiet = TRUE
)

# Impute the data
imputeObj <- impute(
    drawObj,
    references = c("DRUG" = "PLACEBO", "PLACEBO" = "PLACEBO")
)

# Fit the analysis model on each imputed dataset
anaObj <- analyse(
    imputeObj,
    ancova,
    vars = set_vars(
        subjid = "PATIENT",
        outcome = "CHANGE",
        visit = "VISIT",
        group = "THERAPY",
        covariates = c("BASVAL")
    )
)

# Apply a delta adjustment

# Add a delta-value of 5 to all imputed values (i.e. those values
# which were missing in the original dataset) in the drug arm.
delta_df <- delta_template(imputeObj) %>%
    as_tibble() %>%
    mutate(delta = if_else(THERAPY == "DRUG" & is_missing, 5, 0)) %>%
    select(PATIENT, VISIT, delta)

# Repeat the analyses with the adjusted values
anaObj_delta <- analyse(
    imputeObj,
    ancova,
    delta = delta_df,
    vars = set_vars(
        subjid = "PATIENT",
        outcome = "CHANGE",
        visit = "VISIT",
        group = "THERAPY",
        covariates = c("BASVAL")
    )
)

# Pool the results
poolObj <- pool(
    anaObj,
    conf.level = 0.95,
    alternative = "two.sided"
)
This document describes the statistical methods implemented in the rbmi R package for standard and reference-based multiple imputation of continuous longitudinal outcomes.
+The package implements three classes of multiple imputation (MI) approaches:
Conventional MI methods based on Bayesian (or approximate Bayesian) posterior draws of model parameters combined with Rubin’s rules to make inferences as described in Carpenter, Roger, and Kenward (2013) and Cro et al. (2020).
Conditional mean imputation methods combined with re-sampling techniques as described in Wolbers et al. (2021).
Bootstrapped MI methods as described in von Hippel and Bartlett (2021).
The document is structured as follows: we first provide an informal introduction to estimands and corresponding treatment effect estimation based on MI (section 2). The core of this document consists of section 3 which describes the statistical methodology in detail and also contains a comparison of the implemented approaches (section 3.10). The link between theory and the functions included in package rbmi
is described in section 4. We conclude with a comparison of our package to some alternative software implementations of reference-based imputation methods (section 5).
The ICH E9(R1) addendum on estimands and sensitivity analyses describes a systematic approach to ensure alignment among clinical trial objectives, trial execution/conduct, statistical analyses, and interpretation of results (ICH E9 working group (2019)). As per the addendum, an estimand is a precise description of the treatment effect reflecting the clinical question posed by the trial objective which summarizes at a population-level what the outcomes would be in the same patients under different treatment conditions being compared. One important attribute of an estimand is a list of possible intercurrent events (ICEs), i.e. of events occurring after treatment initiation that affect either the interpretation or the existence of the measurements associated with the clinical question of interest, and the definition of appropriate strategies to deal with ICEs. The three most relevant strategies for the purpose of this document are the hypothetical strategy, the treatment policy strategy, and the composite strategy. For the hypothetical strategy, a scenario is envisaged in which the ICE would not occur. Under this scenario, endpoint values after the ICE are not directly observable and are treated using models for missing data. For the treatment policy strategy, the treatment effect in the presence of the ICEs is targeted and analyses are based on the observed outcomes regardless whether the subject had an ICE or not. For the composite strategy, the ICE itself is included as a component of the endpoint.
The ICH E9(R1) addendum distinguishes between ICEs and missing data (ICH E9 working group (2019)). Whereas ICEs such as treatment discontinuations reflect clinical practice, the amount of missing data can be minimized in the conduct of a clinical trial. However, there are many connections between missing data and ICEs. For example, it is often difficult to retain subjects in a clinical trial after treatment discontinuation and a subject's dropout from the trial leads to missing data. As another example, outcome values after ICEs addressed using a hypothetical strategy are not directly observable under the hypothetical scenario. Consequently, any observed outcome values after such ICEs are typically discarded and treated as missing data.
+The addendum proposes that estimation methods to address the problem presented by missing data should be selected to align with the estimand. A recent overview of methods to align the estimator with the estimand is Mallinckrodt et al. (2020). A short introduction on estimation methods for studies with longitudinal endpoints can also be found in Wolbers et al. (2021). One prominent statistical method for this purpose is multiple imputation (MI), which is the target of the rbmi
package.
Missing data may occur in subjects without an ICE or prior to the occurrence of an ICE. As such missing outcomes are not associated with an ICE, it is often plausible to impute them under a missing-at-random (MAR) assumption using a standard MMRM imputation model of the longitudinal outcomes. Informally, MAR occurs if the missing data can be fully accounted for by the baseline variables included in the model and the observed longitudinal outcomes, and if the model is correctly specified.
+The MAR imputation model described above is often also a good starting point for imputing data after an ICE handled using a hypothetical strategy (Mallinckrodt et al. (2020)).
+Informally, this assumes that unobserved values after the ICE would have been similar to the observed data from subjects who did not have the ICE and remained under follow-up.
+However, in some situations, it may be more reasonable to assume that missingness is “informative” and indicates a systematically better or worse outcome than in observed subjects. In such situations, MNAR imputation with a \(\delta\)-adjustment could be explored as a sensitivity analysis. \(\delta\)-adjustments add a fixed or random quantity to the imputations in order to make the imputed outcomes systematically worse or better than those observed as described in Cro et al. (2020). In rbmi
only fixed \(\delta\)-adjustments are implemented.
Ideally, data collection continues after an ICE handled with a treatment policy strategy and no missing data arises. Indeed, such post-ICE data are increasingly systematically collected in RCTs. However, despite best efforts, missing data after an ICE such as study treatment discontinuation may still occur because the subject drops out from the study after discontinuation. It is difficult to give definite recommendations regarding the implementation of the treatment policy strategy in the presence of missing data at this stage because the optimal method is highly context dependent and a topic of ongoing statistical research.
For ICEs which are thought to have a negligible effect on efficacy outcomes, standard MAR-based imputation may be appropriate. In contrast, an ICE such as treatment discontinuation may be expected to have a more substantial impact on efficacy outcomes. In such settings, the MAR assumption may still be plausible after conditioning on the subject's time-varying treatment status (Guizzaro et al. (2021)). In this case, one option is to impute missing post-discontinuation data based on subjects who also discontinued treatment but continued to be followed up (Polverejan and Dragalin (2020)). Another option which may require somewhat less post-discontinuation data is to include all subjects in the imputation procedure but to model post-discontinuation data by using time-varying treatment status indicators (e.g. time-varying indicators of treatment compliance, discontinuation, or initiation of rescue treatment) (Guizzaro et al. (2021)). In this approach, post-ICE outcomes are included in every step of the analysis, including in the fitting of the imputation model. It assumes that ICEs may impact post-ICE outcomes but that otherwise missingness is non-informative. The approach also assumes that the time-varying covariates do not contain missing values, deviations in outcomes after the ICE are correctly modeled by these time-varying covariates, and that sufficient post-ICE data are available to inform the regression coefficients of the time-varying covariates. These proposals are relatively recent and there remain open questions regarding the appropriate trade-off between model complexity (e.g. should the model account for a potentially differential effect on post-ICE outcomes depending on the timing of the ICE?) and the variance in the resulting treatment effect estimate. More generally, it is not yet established how much post-discontinuation data is required to implement such methods robustly and without the risk of substantial inflation of variance.
+In some trial settings, only few subjects discontinue the randomized treatment. In other settings, treatment discontinuation rates are higher but it is difficult to retain subjects in the trial after treatment discontinuation leading to sparse data collection after treatment discontinuation. In both settings, the amount of available data after treatment discontinuation may be insufficient to inform an imputation model which explicitly models post-discontinuation data. Depending on the disease area and the anticipated mechanism of action of the intervention, it may be plausible to assume that subjects in the intervention group behave similarly to subjects in the control group after the ICE treatment discontinuation. In this case, reference-based imputation methods are an option (Mallinckrodt et al. (2020)). Reference-based imputation methods formalize the idea to impute missing data in the intervention group based on data from a control or reference group. For a general description and review of reference-based imputation methods, we refer to Carpenter, Roger, and Kenward (2013), Cro et al. (2020), I. White, Royes, and Best (2020) and Wolbers et al. (2021). For a technical description of the implemented statistical methodology for reference-based imputation, we refer to section 3 (in particular section 3.4).
+The composite strategy is typically applied to binary or time-to-event outcomes but it can also be used for continuous outcomes by ascribing a suitably unfavorable value to patients who experience ICEs for which a composite strategy has been defined. One possibility to implement this is to use MI with a \(\delta\)-adjustment for post-ICE data as described in Darken et al. (2020).
+Analyses of datasets with missing data always rely on missing data assumptions. The methods described here can be used to produce valid imputations under a MAR assumption or under reference-based imputation assumptions. MNAR imputation based on fixed \(\delta\)-adjustments as typically used in sensitivity analyses such as tipping-point analyses are also supported.
Three general imputation approaches are implemented in rbmi:
Conventional MI based on Bayesian (or approximate Bayesian) posterior draws from the imputation model combined with Rubin’s rules for inference as described in Carpenter, Roger, and Kenward (2013) and Cro et al. (2020).
Conditional mean imputation based on the REML estimate of the imputation model combined with resampling techniques (the jackknife or the bootstrap) for inference as described in Wolbers et al. (2021).
Bootstrapped MI methods based on REML estimates of the imputation model as described in von Hippel and Bartlett (2021).
Conventional MI approaches include the following steps:
+Fit a Bayesian multivariate normal mixed model for repeated measures (MMRM) to the observed longitudinal outcomes after exclusion of data after ICEs for which reference-based missing data imputation is desired (Section 3.3.3). Draw \(M\) posterior samples of the estimated parameters (regression coefficients and covariance matrices) from this model.
Alternatively, \(M\) approximate posterior draws from the posterior distribution can be sampled by repeatedly applying conventional restricted maximum-likelihood (REML) parameter estimation of the MMRM model to nonparametric bootstrap samples from the original dataset (Section 3.3.4).
Take a single sample \(m\) (\(m\in 1,\ldots, M)\) from the posterior distribution of the imputation model parameters.
For each subject, use the sampled parameters and the defined imputation strategy to determine the mean and covariance matrix describing the subject’s marginal outcome distribution for all longitudinal outcome assessments (i.e. observed and missing outcomes).
For each subject, construct the conditional multivariate normal distribution of their missing outcomes given their observed outcomes (including observed outcomes after ICEs for which a reference-based assumption is desired).
For each subject, draw a single sample from this conditional distribution to impute their missing outcomes leading to a complete imputed dataset.
For sensitivity analyses, a pre-defined \(\delta\)-adjustment may be applied to the imputed data prior to the analysis step. (Section 3.5).
The conditional mean imputation approach includes the following steps:
+For each subject, use the fitted parameters from step 1. to construct the conditional distribution of missing outcomes given observed outcomes (including observed outcomes after ICEs for which reference-based missing data imputation is desired) as described above.
For each subject, impute their missing data deterministically by the mean of this conditional distribution leading to a complete imputed dataset.
For sensitivity analyses, a pre-defined \(\delta\)-adjustment may be applied to the imputed data prior to the analysis step. (Section 3.5).
The bootstrapped MI approach includes the following steps:
+Take a bootstrapped dataset \(b\) (\(b\in 1,\ldots, B)\) and its corresponding imputation model parameter estimates.
For each subject (from the bootstrapped dataset), use the parameter estimates and the defined strategy for dealing with their ICEs to determine the mean and covariance matrix describing the subject’s marginal outcome distribution for all longitudinal outcome assessments (i.e. observed and missing outcomes).
For each subject (from the bootstrapped dataset), construct the conditional multivariate normal distribution of their missing outcomes given their observed outcomes (including observed outcomes after ICEs for which reference-based missing data imputation is desired).
For each subject (from the bootstrapped dataset), draw \(D\) samples from this conditional distribution to impute their missing outcomes, leading to \(D\) complete imputed datasets for bootstrap sample \(b\).
For sensitivity analyses, a pre-defined \(\delta\)-adjustment may be applied to the imputed data prior to the analysis step. (Section 3.5).
Assume that the data are from a study with \(n\) subjects in total and that each subject \(i\) (\(i=1,\ldots,n\)) has \(J\) scheduled follow-up visits at which the outcome of interest is assessed.
+In most applications, the data will be from a randomized trial of an intervention vs a control group and the treatment effect of interest is a comparison in outcomes at a specific visit between these randomized groups. However, single-arm trials or multi-arm trials are in principle also supported by the rbmi
implementation.
Denote the observed outcome vector of length \(J\) for subject \(i\) by \(Y_i\) (with missing assessments coded as NA (not available)) and its non-missing and missing components by \(Y_{i!}\) and \(Y_{i?}\), respectively.
By default, imputation of missing outcomes in \(Y_{i}\) is performed under a MAR assumption in rbmi. Therefore, if missing data following an ICE are to be handled using MAR imputation, this is compatible with the default assumption. As discussed in Section 2, the MAR assumption is often a good starting point for implementing a hypothetical strategy. But also note that observed outcome data after an ICE handled using a hypothetical strategy is not compatible with this strategy. Therefore, we assume that all post-ICE data after ICEs handled using a hypothetical strategy are already set to NA in \(Y_i\) prior to calling any rbmi functions. However, any observed outcomes after ICEs handled using a treatment policy strategy should be included in \(Y_i\) as they are compatible with this strategy.
Subjects may also experience up to one ICE after which missing data imputation according to a reference-based imputation method is foreseen. For a subject \(i\) with such an ICE, denote their first visit which is affected by the ICE by \(\tilde{t}_i \in \{1,\ldots,J\}\). For all other subjects, set \(\tilde{t}_i=\infty\). A subject’s outcome vector after setting observed outcomes from visit \(\tilde{t}_i\) onwards to missing (i.e. NA) is denoted as \(Y'_i\) and the corresponding data vector after removal of NA elements as \(Y'_{i!}\).
+MNAR \(\delta\)-adjustments are added to the imputed datasets after the formal imputation steps. This is covered in a separate section (Section 3.5).
The purpose of the imputation model is to estimate (covariate-dependent) mean trajectories and covariance matrices for each group in the absence of ICEs handled using reference-based imputation methods. Conventionally, publications on reference-based imputation methods have implicitly assumed that the corresponding post-ICE data is missing for all subjects (Carpenter, Roger, and Kenward (2013)). We also allow the situation where post-ICE data is available for some subjects but needs to be imputed using reference-based methods for others. However, any observed data after ICEs for which reference-based imputation methods are specified is not compatible with the imputation model described below and they are therefore removed and considered as missing for the purpose of estimating the imputation model, and for this purpose only. For example, if a patient has an ICE addressed with a reference-based method but outcomes after the ICE are collected, these post-ICE outcomes will be excluded when fitting the base imputation model (but they will be included again in the following steps). That is, the base imputation model is fitted to \(Y'_{i!}\) and not to \(Y_{i!}\). If we did not exclude these data, then the imputation model would mistakenly estimate mean trajectories based on a mixture of observed pre- and post-ICE data which are not relevant for reference-based imputations.
+Observed post-ICE outcomes in the control or reference group are also excluded from the base imputation model if the user specifies a reference-based imputation strategy for such ICEs. This ensures that an ICE has the same impact on the data included in the imputation model regardless whether the ICE occurred in the control or the intervention group. On the other hand, imputation in the reference group is based on a MAR assumption even for reference-based imputation methods and it may be preferable in some settings to include post-ICE data from the control group in the base imputation model. This can be implemented by specifying a MAR
strategy for the ICE in the control group and a reference-based strategy for the same ICE in the intervention group.
The base imputation model of the longitudinal outcomes \(Y'_i\) assumes that the mean structure is a linear function of covariates. Full flexibility for the specification of the linear predictor of the model is supported. At a minimum the covariates should include the treatment group, the (categorical) visit, and treatment-by-visit interactions. Typically, other covariates including the baseline outcome are also included. External time-varying covariates (e.g. calendar time of the visit) as well as internal time-varying covariates (e.g. time-varying indicators of treatment discontinuation or initiation of rescue treatment) may in principle also be included if indicated (Guizzaro et al. (2021)). Missing covariate values are not allowed. This means that the values of time-varying covariates must be non-missing at every visit regardless of whether the outcome is measured or missing.
Denote the \(J\times p\) design matrix for subject \(i\) corresponding to the mean structure model by \(X_i\) and the same matrix after removal of rows corresponding to missing outcomes in \(Y'_{i!}\) by \(X'_{i!}\). Here \(p\) is the number of parameters in the mean structure of the model for the elements of \(Y'_{i!}\). The base imputation model for the observed outcomes is defined as:
\[ Y'_{i!} = X'_{i!}\beta + \epsilon_{i!} \mbox{ with } \epsilon_{i!}\sim N(0,\Sigma_{i!!})\]
where \(\beta\) is the vector of regression coefficients and \(\Sigma_{i!!}\) is a covariance matrix which is obtained from the complete-data \(J\times J\)-covariance matrix \(\Sigma\) by omitting rows and columns corresponding to missing outcome assessments for subject \(i\).
Typically, a common unstructured covariance matrix for all subjects is assumed for \(\Sigma\) but separate covariance matrices per treatment group are also supported. Indeed, the implementation also supports the specification of separate covariance matrices according to an arbitrarily defined categorical variable which groups the subjects into disjoint subsets. For example, this could be useful if different covariance matrices are suspected in different subject strata. Finally, for all imputation methods described below that do not rely on Bayesian model fitting through MCMC, there is further flexibility in the choice of the covariance structure, i.e. unstructured (default), heterogeneous Toeplitz, heterogeneous compound symmetry, and AR(1) covariance structures are supported.
Frequentist parameter estimation for the base imputation model is based on REML. The use of REML as an improved alternative to maximum likelihood (ML) for covariance parameter estimation was originally proposed by Patterson and Thompson (1971). Since then, it has become the default method for parameter estimation in linear mixed effects models. rbmi allows the user to choose between ML and REML methods to estimate the model parameters, with REML being the default option.
The Bayesian imputation model is fitted with the R package rstan
(Stan Development Team (2020)). rstan
is the R interface of Stan. Stan is a powerful and flexible statistical software developed by a dedicated team and implements Bayesian inference with state-of-the-art MCMC sampling procedures. The multivariate normal model with missing data specified in section 3.3.1 can be considered a generalization of the models described in the Stan user’s guide (see Stan Development Team (2020, sec. 3.5)).
The same prior distributions as in the SAS implementation of the "five macros" are used (Roger (2021)), i.e. an improper flat prior for the regression coefficients and a weakly informative inverse Wishart prior for the covariance matrix (or matrices). Specifically, let \(S \in \mathbb{R}^{J \times J}\) be a symmetric positive definite matrix and \(\nu \in (J-1, \infty)\). Then the symmetric positive definite matrix \(x \in \mathbb{R}^{J \times J}\) has density:
\[
\text{InvWish}(x \vert \nu, S) = \frac{1}{2^{\nu J/2}} \frac{1}{\Gamma_J(\frac{\nu}{2})} \vert S \vert^{\nu/2} \vert x \vert ^{-(\nu + J + 1)/2} \text{exp}(-\frac{1}{2} \text{tr}(Sx^{-1})).
\]
For \(\nu > J+1\) the mean is given by:
\[
E[x] = \frac{S}{\nu - J - 1}.
\]
We choose \(S\) equal to the estimated covariance matrix from the frequentist REML fit and \(\nu = J+2\) as these are the lowest degrees of freedom that guarantee a finite mean. Setting the degrees of freedom with such a low \(\nu\) ensures that the prior has little impact on the posterior. Moreover, this choice allows us to interpret the parameter \(S\) as the mean of the prior distribution.
+As in the “five macros”, the MCMC algorithm is initialized at the parameters from a frequentist REML fit (see section 3.3.2). As described above, we are using only weakly informative priors for the parameters. Therefore, the Markov chain is essentially starting from the targeted stationary posterior distribution and only a minimal amount of burn-in of the chain is required.
+Several authors have suggested that a stabler way to get Bayesian posterior draws from the imputation model is to bootstrap the incomplete data and to calculate REML estimates for each bootstrap sample (Little and Rubin (2002), Efron (1994), Honaker and King (2010), von Hippel and Bartlett (2021)). This method is proper in that the REML estimates from the bootstrap samples are asymptotically equivalent to a sample from the posterior distribution and may provide additional robustness to model misspecification (Little and Rubin (2002, sec. 10.2.3, part 6), Honaker and King (2010)). In order to retain balance between treatment groups and stratification factors across bootstrap samples, the user is able to provide stratification variables for the bootstrap in the rbmi
implementation.
For each subject \(i\), the marginal distribution of the complete \(J\)-dimensional outcome vector from all assessment visits according to the imputation model is a multivariate normal distribution. Its mean \(\tilde{\mu}_i\) is given by the predicted mean from the imputation model conditional on the subject’s baseline characteristics, group, and, optionally, time-varying covariates. Its covariance matrix \(\tilde{\Sigma}_i\) is given by the overall estimated covariance matrix or, if different covariance matrices are assumed for different groups, the covariance matrix corresponding to subject \(i\)’s group.
For each subject \(i\), we calculate the mean and covariance matrix of the complete \(J\)-dimensional outcome vector from all assessment visits as for the MAR case and denote them by \(\mu_i\) and \(\Sigma_i\). For reference-based imputation methods, a corresponding reference group is also required for each group. Typically, the reference group for the intervention group will be the control group. The reference mean \(\mu_{ref,i}\) is defined as the predicted mean from the imputation model conditional on the reference group (rather than the actual group subject \(i\) belongs to) and the subject's baseline characteristics. The reference covariance matrix \(\Sigma_{ref,i}\) is the overall estimated covariance matrix or, if different covariance matrices are assumed for different groups, the estimated covariance matrix corresponding to the reference group. In principle, time-varying covariates could also be included in reference-based imputation methods. However, this is only sensible for external time-varying covariates (e.g. calendar time of the visit) and not for internal time-varying covariates (e.g. treatment discontinuation) because the latter likely depend on the actual treatment group and it is typically not sensible to assume the same trajectory of the time-varying covariate for the reference group.
Based on these means and covariance matrices, the subject's marginal imputation distribution for the reference-based imputation methods is then calculated as detailed in Carpenter, Roger, and Kenward (2013, sec. 4.3). Denote the mean and covariance matrix of this marginal imputation distribution by \(\tilde{\mu}_i\) and \(\tilde{\Sigma}_i\). Recall that the subject's first visit which is affected by the ICE is denoted by \(\tilde{t}_i \in \{1,\ldots,J\}\) (and visit \(\tilde{t}_i-1\) is the last visit unaffected by the ICE). The marginal distribution for the patient \(i\) is then built according to the specific assumption for the data up to and post the ICE as follows:
Jump to reference (JR): the patient's outcome distribution is normally distributed with the following mean:
\[\tilde{\mu}_i = (\mu_i[1], \dots, \mu_i[\tilde{t}_i-1], \mu_{ref,i}[\tilde{t}_i], \dots, \mu_{ref,i}[J])^T.\]
The covariance matrix is constructed as follows. First, we partition the covariance matrices \(\Sigma_i\) and \(\Sigma_{ref,i}\) in blocks according to the time of the ICE \(\tilde{t}_i\):
\[
\Sigma_{i} = \begin{bmatrix} \Sigma_{i, 11} & \Sigma_{i, 12} \\
\Sigma_{i, 21} & \Sigma_{i,22} \\
\end{bmatrix}
\]
\[
\Sigma_{ref,i} = \begin{bmatrix} \Sigma_{ref, i, 11} & \Sigma_{ref, i, 12} \\
\Sigma_{ref, i, 21} & \Sigma_{ref, i,22} \\
\end{bmatrix}.
\]
We want the covariance matrix \(\tilde{\Sigma}_i\) to match \(\Sigma_i\) for the pre-deviation measurements, and \(\Sigma_{ref,i}\) for the conditional components for the post-deviation given the pre-deviation measurements. The solution is derived in Carpenter, Roger, and Kenward (2013, sec. 4.3) and is given by:
\[
\begin{matrix}
\tilde{\Sigma}_{i,11} = \Sigma_{i, 11} \\
\tilde{\Sigma}_{i, 21} = \Sigma_{ref,i, 21} \Sigma^{-1}_{ref,i, 11} \Sigma_{i, 11} \\
\tilde{\Sigma}_{i, 22} = \Sigma_{ref, i, 22} - \Sigma_{ref,i, 21} \Sigma^{-1}_{ref,i, 11} (\Sigma_{ref,i, 11} - \Sigma_{i,11}) \Sigma^{-1}_{ref,i, 11} \Sigma_{ref,i, 12}.
\end{matrix}
\]
Copy increments in reference (CIR): the patient's outcome distribution is normally distributed with the following mean:
\[
\begin{split}
\tilde{\mu}_i =& (\mu_i[1], \dots, \mu_i[\tilde{t}_i-1], \mu_i[\tilde{t}_i-1] + (\mu_{ref,i}[\tilde{t}_i] - \mu_{ref,i}[\tilde{t}_i-1]), \dots,\\ &
\mu_i[\tilde{t}_i-1]+(\mu_{ref,i}[J] - \mu_{ref,i}[\tilde{t}_i-1]))^T.
\end{split}
\]
The covariance matrix is derived as for the JR method.
Copy reference (CR): the patient's outcome distribution is normally distributed with mean and covariance matrix taken from the reference group:
\[
\tilde{\mu}_i = \mu_{ref,i}
\]
\[
\tilde{\Sigma}_i = \Sigma_{ref,i}.
\]
Last mean carried forward (LMCF): the patient's outcome distribution is normally distributed with the following mean:
\[ \tilde{\mu}_i = (\mu_i[1], \dots, \mu_i[\tilde{t}_i-1], \mu_i[\tilde{t}_i-1], \dots, \mu_i[\tilde{t}_i-1])^T\]
and covariance matrix:
\[ \tilde{\Sigma}_i = \Sigma_i.\]
The joint marginal multivariate normal imputation distribution of subject \(i\)’s observed and missing outcome data has mean \(\tilde{\mu}_i\) and covariance matrix \(\tilde{\Sigma}_i\) as defined above. The actual imputation of the missing outcome data is obtained by conditioning this marginal distribution on the subject’s observed outcome data. Of note, this approach is valid regardless whether the subject has intermittent or terminal missing data.
The conditional distribution used for the imputation is again a multivariate normal distribution and explicit formulas for the conditional mean and covariance are readily available. For completeness, we report them here with the notation and terminology of our setting. The marginal distribution for the outcome of patient \(i\) is \(Y_i \sim N(\tilde{\mu}_i, \tilde{\Sigma}_i)\) and the outcome \(Y_i\) can be decomposed in the observed (\(Y_{i,!}\)) and the unobserved (\(Y_{i,?}\)) components. Analogously the mean \(\tilde{\mu}_i\) can be decomposed as \((\tilde{\mu}_{i,!},\tilde{\mu}_{i,?})\) and the covariance \(\tilde{\Sigma}_i\) as:
\[
\tilde{\Sigma}_i =
\begin{bmatrix}
\tilde{\Sigma}_{i, !!} & \tilde{\Sigma}_{i,!?} \\
\tilde{\Sigma}_{i, ?!} & \tilde{\Sigma}_{i, ??}
\end{bmatrix}.
\]
The conditional distribution of \(Y_{i,?}\) conditional on \(Y_{i,!}\) is then a multivariate normal distribution with expectation
\[
E(Y_{i,?} \vert Y_{i,!})= \tilde{\mu}_{i,?} + \tilde{\Sigma}_{i, ?!} \tilde{\Sigma}_{i,!!}^{-1} (Y_{i,!} - \tilde{\mu}_{i,!})
\]
and covariance matrix
\[
Cov(Y_{i,?} \vert Y_{i,!}) = \tilde{\Sigma}_{i,??} - \tilde{\Sigma}_{i,?!} \tilde{\Sigma}_{i,!!}^{-1} \tilde{\Sigma}_{i,!?}.
\]
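The following is a small illustrative R sketch (not rbmi code; the function name and inputs are assumptions for illustration only) of how the conditional mean and covariance above can be computed from a subject's marginal mean vector, covariance matrix, and observed components:

# Illustrative sketch: conditional distribution of the missing components given the
# observed ones for a multivariate normal with mean `mu` and covariance `sigma`.
# `obs` is a logical vector flagging observed visits; `y_obs` holds the observed outcomes.
condition_mvn <- function(mu, sigma, obs, y_obs) {
    s_oo <- sigma[obs, obs, drop = FALSE]
    s_mo <- sigma[!obs, obs, drop = FALSE]
    s_mm <- sigma[!obs, !obs, drop = FALSE]
    mean_cond <- mu[!obs] + s_mo %*% solve(s_oo, y_obs - mu[obs])   # conditional mean
    cov_cond  <- s_mm - s_mo %*% solve(s_oo, t(s_mo))               # conditional covariance
    list(mean = drop(mean_cond), cov = cov_cond)
}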
+Conventional random imputation consists in sampling from this conditional multivariate normal distribution. Conditional mean imputation imputes missing values with the deterministic conditional expectation \(E(Y_{i,?} \vert Y_{i,!})\).
A marginal \(\delta\)-adjustment approach similar to the "five macros" in SAS is implemented (Roger (2021)), i.e. fixed non-stochastic values are added after the multivariate normal imputation step and prior to the analysis. This is relevant for sensitivity analyses in order to make imputed data systematically worse or better than observed data. In addition, some authors have suggested \(\delta\)-type adjustments to implement a composite strategy for continuous outcomes (Darken et al. (2020)).
+The implementation provides full flexibility regarding the specific implementation of the \(\delta\)-adjustment, i.e. the value that is added may depend on the randomized treatment group, the timing of the subject’s ICE, and other factors. For suggestions and case studies regarding this topic, we refer to Cro et al. (2020).
+After data imputation, a standard analysis model can be applied to the completed data resulting in a treatment effect estimate. As the imputed data no longer contains missing values, the analysis model is often simple. For example, it can be an analysis of covariance (ANCOVA) model with the outcome (or the change in the outcome from baseline) at a specific visit j as the dependent variable, the randomized treatment group as the primary covariate and, typically, adjustment for the same baseline covariates as for the imputation model.
+Assume that the analysis model has been applied to \(M\) multiple imputed random datasets which resulted in \(m\) treatment effect estimates \(\hat{\theta}_m\) (\(m=1,\ldots,M\)) with corresponding standard error \(SE_m\) and (if available) degrees of freedom \(\nu_{com}\). If degrees of freedom are not available for an analysis model, set \(\nu_{com}=\infty\) for inference based on the normal distribution.
Rubin's rules are used for pooling the treatment effect estimates and corresponding variance estimates from the analysis steps across the \(M\) multiple imputed datasets. According to Rubin's rules, the final estimate of the treatment effect is calculated as the sample mean over the \(M\) treatment effect estimates:
\[
\hat{\theta} = \frac{1}{M} \sum_{m = 1}^M \hat{\theta}_m.
\]
The pooled variance is based on two components that reflect the within and the between variance of the treatment effects across the multiple imputed datasets:
\[
V(\hat{\theta}) = V_W(\hat{\theta}) + (1 + \frac{1}{M}) V_B(\hat{\theta})
\]
where \(V_W(\hat{\theta}) = \frac{1}{M}\sum_{m = 1}^M SE^2_m\) is the within-variance and \(V_B(\hat{\theta}) = \frac{1}{M-1} \sum_{m = 1}^M (\hat{\theta}_m - \hat{\theta})^2\) is the between-variance.
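As an illustrative sketch only (this is not rbmi's internal implementation; rubin_pool() and its inputs are assumed names), Rubin's rules above can be written in a few lines of R:

# Illustrative sketch of Rubin's rules: `theta_m` are the M point estimates and
# `se_m` the corresponding standard errors from the analyses of the imputed datasets.
rubin_pool <- function(theta_m, se_m) {
    M <- length(theta_m)
    theta <- mean(theta_m)               # pooled point estimate
    v_w <- mean(se_m^2)                  # within-imputation variance
    v_b <- var(theta_m)                  # between-imputation variance
    v <- v_w + (1 + 1 / M) * v_b         # total variance
    c(est = theta, se = sqrt(v))
}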
+Confidence intervals and tests of the null hypothesis \(H_0: \theta=\theta_0\) are based on the \(t\)-statistics \(T\):
\[ T= (\hat{\theta}-\theta_0)/\sqrt{V(\hat{\theta})}. \]
Under the null hypothesis, \(T\) has an approximate \(t\)-distribution with \(\nu\) degrees of freedom. \(\nu\) is calculated according to the Barnard and Rubin approximation, see Barnard and Rubin (1999) (formula 3) or Little and Rubin (2002) (formula (5.24), page 87):
\[
\nu = \frac{\nu_{old} \cdot \nu_{obs}}{\nu_{old} + \nu_{obs}}
\]
with
\[
\nu_{old} = \frac{M-1}{\lambda^2} \quad\mbox{and}\quad \nu_{obs} = \frac{\nu_{com} + 1}{\nu_{com} + 3} \nu_{com} (1 - \lambda)
\]
where \(\lambda = \frac{(1 + \frac{1}{M})V_B(\hat{\theta})}{V(\hat{\theta})}\) is the fraction of missing information.
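Continuing the sketch above (again purely illustrative, with assumed argument names), the Barnard and Rubin degrees of freedom can be computed from the between-variance, the total variance, the number of imputations, and the complete-data degrees of freedom:

# Illustrative sketch of the Barnard and Rubin degrees of freedom
barnard_rubin_df <- function(v_b, v_total, M, nu_com) {
    lambda <- (1 + 1 / M) * v_b / v_total          # fraction of missing information
    nu_old <- (M - 1) / lambda^2
    nu_obs <- (nu_com + 1) / (nu_com + 3) * nu_com * (1 - lambda)
    nu_old * nu_obs / (nu_old + nu_obs)
}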
+The point estimator is obtained by applying the analysis model (Section 3.6) to a single conditional mean imputation of the missing data (see Section 3.4.3) based on the REML estimator of the parameters of the imputation model (see Section 3.3.2). We denote this treatment effect estimator by \(\hat{\theta}\).
+As demonstrated in Wolbers et al. (2021) (Section 2.4), this treatment effect estimator is valid if the analysis model is an ANCOVA model or, more generally, if the treatment effect estimator is a linear function of the imputed outcome vector. Indeed, if this is the case, then the estimator is identical to the pooled treatment effect across multiple random REML imputations with an infinite number of imputations and corresponds to a computationally efficient implementation of a proposal by von Hippel and Bartlett (2021). We expect that the conditional mean imputation method is also applicable to some other analysis models (e.g. for general MMRM analysis models) but this has not been formally justified.
+For a dataset containing \(n\) subjects, the jackknife standard error depends on treatment effect estimates \(\hat{\theta}_{(-b)}\) (\(b=1,\ldots,n\)) from samples of the original dataset which leave out the observation from subject \(b\). As described previously, to obtain treatment effect estimates for leave-one-subject-out datasets, all steps of the imputation procedure (i.e. imputation, conditional mean imputation, and analysis steps) need to be repeated on this new dataset.
+Then, the jackknife standard error is defined as
+\[
+\hat{se}_{jack} = \left[\frac{n-1}{n} \sum_{b=1}^{n} \left(\hat{\theta}_{(-b)}-\bar{\theta}_{(\cdot)}\right)^2\right]^{1/2}
+\]
+where \(\bar{\theta}_{(\cdot)}\) denotes the mean of all jackknife estimates (Efron and Tibshirani (1994), chapter 10). The corresponding two-sided normal approximation \(1-\alpha\) CI is defined as \(\hat{\theta}\pm z^{1-\alpha/2}\cdot \hat{se}_{jack}\) where \(\hat{\theta}\) is the treatment effect estimate from the original dataset. Tests of the null hypothesis \(H_0: \theta=\theta_0\) are then based on the \(Z\)-score \(Z=(\hat{\theta}-\theta_0)/\hat{se}_{jack}\) using a standard normal approximation.
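+A schematic implementation of jackknife-based inference is sketched below; fit_and_estimate() is a hypothetical function that reruns all imputation and analysis steps on the supplied dataset and returns a single treatment effect estimate:
+```r
+# Jackknife standard error, CI and test for a deterministic estimator.
+# `data` has one row per observation with a subject identifier `id`.
+jackknife_inference <- function(data, fit_and_estimate, theta_0 = 0, alpha = 0.05) {
+  ids <- unique(as.character(data$id))
+  n <- length(ids)
+  # leave-one-subject-out estimates: rerun the full pipeline for each subject
+  theta_loo <- vapply(
+    ids,
+    function(i) fit_and_estimate(data[data$id != i, , drop = FALSE]),
+    numeric(1)
+  )
+  se_jack <- sqrt((n - 1) / n * sum((theta_loo - mean(theta_loo))^2))
+  theta_hat <- fit_and_estimate(data)   # estimate from the original dataset
+  z <- qnorm(1 - alpha / 2)
+  list(
+    est = theta_hat,
+    se  = se_jack,
+    ci  = c(theta_hat - z * se_jack, theta_hat + z * se_jack),
+    p   = 2 * pnorm(-abs((theta_hat - theta_0) / se_jack))
+  )
+}
+```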
+A simulation study reported in Wolbers et al. (2021) demonstrated exact protection of the type I error for jackknife-based inference with a relatively low sample size (n = 100 per group) and a substantial amount of missing data (>25% of subjects with an ICE).
+As an alternative to the jackknife, the bootstrap has also been implemented in rbmi (Efron and Tibshirani (1994), Davison and Hinkley (1997)). Two different bootstrap methods are implemented in rbmi: methods based on the bootstrap standard error combined with a normal approximation, and percentile bootstrap methods. Denote the treatment effect estimates from \(B\) bootstrap samples by \(\hat{\theta}^*_b\) (\(b=1,\ldots,B\)). The bootstrap standard error \(\hat{se}_{boot}\) is defined as the empirical standard deviation of the bootstrapped treatment effect estimates. Confidence intervals and tests based on the bootstrap standard error can then be constructed in the same way as for the jackknife. Confidence intervals using the percentile bootstrap are based on empirical quantiles of the bootstrap distribution, and corresponding statistical tests are implemented in rbmi via inversion of the confidence interval. Explicit formulas for bootstrap inference as implemented in the rbmi package and some considerations regarding the required number of bootstrap samples are included in the Appendix of Wolbers et al. (2021).
+A simulation study reported in Wolbers et al. (2021) demonstrated a small inflation of the type I error rate for inference based on the bootstrap standard error (up to \(5.3\%\) for a nominal type I error rate of \(5\%\)) for a sample size of n = 100 per group and a substantial amount of missing data (>25% of subjects with an ICE). Based on these simulations, we recommend the jackknife over the bootstrap for inference because it performed better in our simulation study and is typically much faster to compute than the bootstrap.
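+Both bootstrap approaches can be sketched as follows, given the estimate theta_hat from the original data and a vector theta_star of \(B\) bootstrap estimates. The percentile p-value is shown in a simple empirical form; the exact formulas used by rbmi are those given in the Appendix of Wolbers et al. (2021):
+```r
+# Bootstrap inference from the original estimate and B bootstrap estimates
+bootstrap_inference <- function(theta_hat, theta_star, theta_0 = 0, alpha = 0.05) {
+  # (1) bootstrap standard error + normal approximation
+  se_boot <- sd(theta_star)
+  z <- qnorm(1 - alpha / 2)
+  ci_normal <- c(theta_hat - z * se_boot, theta_hat + z * se_boot)
+  p_normal <- 2 * pnorm(-abs((theta_hat - theta_0) / se_boot))
+
+  # (2) percentile bootstrap: CI from empirical quantiles, p-value by CI inversion
+  ci_pct <- unname(quantile(theta_star, c(alpha / 2, 1 - alpha / 2)))
+  p_pct <- 2 * min(mean(theta_star <= theta_0), mean(theta_star >= theta_0))
+
+  list(
+    normal     = list(se = se_boot, ci = ci_normal, p = p_normal),
+    percentile = list(ci = ci_pct, p = p_pct)
+  )
+}
+```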
+Assume that the analysis model has been applied to \(B\times D\) multiple imputed random datasets which resulted in \(B\times D\) treatment effect estimates \(\hat{\theta}_{bd}\) (\(b=1,\ldots,B\); \(d=1,\ldots,D\)).
+The final estimate of the treatment effect is calculated as the sample mean over the \(B \cdot D\) treatment effect estimates:
+\[
+\hat{\theta} = \frac{1}{BD} \sum_{b = 1}^B \sum_{d = 1}^D \hat{\theta}_{bd}.
+\]
+The pooled variance is based on two components that reflect the variability within and between imputed bootstrap samples (von Hippel and Bartlett (2021), formula 8.4):
+\[
+V(\hat{\theta}) = \left(1 + \frac{1}{B}\right)\frac{MSB - MSW}{D} + \frac{MSW}{BD}
+\]
+where \(MSB\) is the mean square between the bootstrapped datasets, and \(MSW\) is the mean square within the bootstrapped datasets and between the imputed datasets:
+\[
+\begin{align*}
+MSB &= \frac{D}{B-1} \sum_{b = 1}^B (\bar{\theta}_{b} - \hat{\theta})^2 \\
+MSW &= \frac{1}{B(D-1)} \sum_{b = 1}^B \sum_{d = 1}^D (\hat{\theta}_{bd} - \bar{\theta}_{b})^2
+\end{align*}
+\]
+where \(\bar{\theta}_{b}\) is the mean across the \(D\) estimates obtained from random imputation of the \(b\)-th bootstrap sample.
+The degrees of freedom are estimated with the following formula (von Hippel and Bartlett (2021), formula 8.6):
+\[
+\nu = \frac{\left(MSB\cdot (B+1) - MSW\cdot B\right)^2}{\frac{MSB^2\cdot (B+1)^2}{B-1} + \frac{MSW^2\cdot B}{D-1}}
+\]
+Confidence intervals and tests of the null hypothesis \(H_0: \theta=\theta_0\) are based on the \(t\)-statistic \(T\):
+\[
+T = (\hat{\theta}-\theta_0)/\sqrt{V(\hat{\theta})}.
+\]
+Under the null hypothesis, \(T\) has an approximate \(t\)-distribution with \(\nu\) degrees of freedom.
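+These pooling formulas translate directly into base R; the sketch below assumes theta is a \(B \times D\) matrix of estimates with rows indexing bootstrap samples, columns indexing imputations, and \(D \geq 2\):
+```r
+# Pooling and inference for bootstrapped MI (von Hippel and Bartlett style)
+pool_bmlmi <- function(theta, theta_0 = 0) {
+  B <- nrow(theta)
+  D <- ncol(theta)
+  theta_hat <- mean(theta)
+  theta_b <- rowMeans(theta)                          # mean per bootstrap sample
+  msb <- D / (B - 1) * sum((theta_b - theta_hat)^2)   # between-bootstrap mean square
+  msw <- sum((theta - theta_b)^2) / (B * (D - 1))     # within-bootstrap mean square
+  v <- (1 + 1 / B) * (msb - msw) / D + msw / (B * D)
+  nu <- (msb * (B + 1) - msw * B)^2 /
+    (msb^2 * (B + 1)^2 / (B - 1) + msw^2 * B / (D - 1))
+  t_stat <- (theta_hat - theta_0) / sqrt(v)
+  list(est = theta_hat, var = v, df = nu, p = 2 * pt(-abs(t_stat), df = nu))
+}
+```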
+All approaches provide consistent treatment effect estimates for standard and reference-based imputation methods in case the analysis model of the completed datasets is a general linear model such as ANCOVA. Methods other than conditional mean imputation should also be valid for other analysis models. The validity of conditional mean imputation has only been formally demonstrated for analyses using the general linear model (Wolbers et al. (2021, sec. 2.4)) though it may also be applicable more widely (e.g. for general MMRM analysis models).
+Treatment effects based on conditional mean imputation are deterministic. All other methods are affected by Monte Carlo sampling error and the precision of estimates depends on the number of imputations or bootstrap samples, respectively.
+All approaches provide frequentist consistent estimates of the standard error for imputation under a MAR assumption. For reference-based imputation methods, methods based on conditional mean imputation or bootstrapped MI provide frequentist consistent estimates of the standard error, whereas Rubin’s rules applied to conventional MI methods provide so-called information-anchored inference (Bartlett (2021), Cro, Carpenter, and Kenward (2019), von Hippel and Bartlett (2021), Wolbers et al. (2021)). Frequentist consistent estimates of the standard error lead to confidence intervals and tests which have (asymptotically) correct coverage and type I error control under the assumption that the reference-based assumption reflects the true data-generating mechanism. For finite samples, simulations for a sample size of \(n=100\) per group reported in Wolbers et al. (2021) demonstrated that conditional mean imputation combined with the jackknife provided exact protection of the type I error rate whereas the bootstrap was associated with a small type I error inflation (between 5.1% and 5.3% for a nominal level of 5%).
+It is well known that Rubin’s rules do not provide frequentist consistent estimates of the standard error for reference-based imputation methods (Seaman, White, and Leacy (2014), Liu and Pang (2016), Tang (2017), Cro, Carpenter, and Kenward (2019), Bartlett (2021)). Standard errors from Rubin’s rules are typically larger than frequentist standard error estimates, leading to conservative inference and a corresponding loss of statistical power; see e.g. the simulations reported in Wolbers et al. (2021).
+Intuitively, this occurs because reference-based imputation methods borrow information from the reference group for imputations in the intervention group, leading to a reduction in the frequentist variance of the resulting treatment effect contrast which is not captured by Rubin’s variance estimator. Formally, this occurs because the imputation and analysis models are uncongenial for reference-based imputation methods (Meng (1994), Bartlett (2021)).
+Cro, Carpenter, and Kenward (2019) argued that Rubin’s rules are nevertheless valid for reference-based imputation methods because they are approximately information-anchored, i.e. the proportion of information lost due to missing data under MAR is approximately preserved in reference-based analyses. In contrast, frequentist standard errors for reference-based imputation are not information anchored, and standard errors under reference-based assumptions are typically smaller than those for MAR imputation.
+Information anchoring is a sensible concept for sensitivity analyses, whereas for a primary analysis it may be more important to adhere to the principles of frequentist inference. Analyses of data with missing observations generally rely on unverifiable missing data assumptions, and the assumptions for reference-based imputation methods are relatively strong. Therefore, these assumptions need to be clinically justified as appropriate or at least conservative for the considered disease area and the anticipated mechanism of action of the intervention.
+Conditional mean imputation combined with the jackknife is the only method which leads to deterministic standard error estimates and, consequently, confidence intervals and \(p\)-values are also deterministic. This is particularly important in a regulatory setting where it is important to ascertain whether a calculated \(p\)-value which is close to the critical boundary of 5% is truly below or above that threshold rather than being uncertain about this because of Monte Carlo error.
+Bayesian MI methods rely on the specification of prior distributions and the usage of Markov chain Monte Carlo (MCMC) methods.
+All other methods based on multiple imputation or bootstrapping require no tuning parameters other than the specification of the number of imputations \(M\) or bootstrap samples \(B\), and rely on numerical optimization for fitting the MMRM imputation models via REML. Conditional mean imputation combined with the jackknife has no tuning parameters.
+In our rbmi implementation, the fitting of the MMRM imputation model via REML is computationally most expensive. MCMC sampling using rstan (Stan Development Team (2020)) is typically relatively fast in our setting and requires only a small burn-in and burn-between of the chains. In addition, the number of random imputations for reliable inference using Rubin’s rules is often smaller than the number of resamples required for the jackknife or the bootstrap (see e.g. the discussions in I. R. White, Royston, and Wood (2011, sec. 7) for Bayesian MI and the Appendix of Wolbers et al. (2021) for the bootstrap). Thus, for many applications, we expect that conventional MI based on Bayesian posterior draws will be fastest, followed by conventional MI using approximate Bayesian posterior draws and conditional mean imputation combined with the jackknife. Conditional mean imputation combined with the bootstrap and bootstrapped MI methods will typically be most computationally demanding. Of note, all implemented methods are conceptually straightforward to parallelize and some parallelization support is provided by rbmi.
+rbmi functions
+For a full documentation of the rbmi package functionality we refer to the help pages of all functions and to the other package vignettes. Here we only give a brief overview of how the different steps of the imputation procedure are mapped to rbmi functions:
+1. Draws of the imputation model parameters are obtained via the function draws(). The chosen MI approach can be set using the argument method and should be one of the following:
+   - method = method_bayes() for Bayesian MI,
+   - method = method_approxbayes() for approximate Bayesian MI,
+   - method = method_condmean(type = "jackknife") for conditional mean imputation with jackknife inference,
+   - method = method_condmean(type = "bootstrap") for conditional mean imputation with bootstrap inference,
+   - method = method_bmlmi(B = B, D = D) for bootstrapped MI, where \(B\) refers to the number of bootstrap samples and \(D\) to the number of random imputations for each bootstrap sample.
+2. The imputation step is performed via the function impute(). Imputation can be performed assuming the already implemented imputation strategies as presented in Section 3.4. Additionally, user-defined imputation strategies are also supported.
+3. The analysis step is performed via the function analyse(), which applies the analysis model to all imputed datasets. By default, the analysis model (argument fun) is the ancova() function, but alternative analysis functions can also be provided by the user. The analyse() function also allows for \(\delta\)-adjustments of the imputed datasets prior to the analysis via the argument delta.
+4. The pooling step is performed via the function pool(), which pools the results across imputed datasets. Rubin’s rules with the Barnard and Rubin degrees of freedom are applied in case of (approximate) Bayesian MI. For conditional mean imputation, jackknife and bootstrap (normal approximation or percentile) inference is supported. For BMLMI, the pooling and inference steps are also performed via pool(), which in this case implements the method described in Section 3.9.
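+The following minimal sketch illustrates how these functions are chained together for a reference-based analysis. The dataset dat, the ICE dataset dat_ice, the variable names, and the specific argument values are hypothetical, and argument names should be checked against the documentation of the installed rbmi version:
+```r
+library(rbmi)
+
+# Variable mapping for a (hypothetical) analysis dataset `dat` with one row
+# per subject and visit; `dat_ice` holds one row per subject with an ICE.
+vars <- set_vars(
+  subjid     = "id",
+  visit      = "visit",
+  group      = "group",
+  outcome    = "outcome",
+  covariates = c("baseline", "baseline*visit")
+)
+
+# 1. Parameter draws, here via Bayesian MI (number of samples is illustrative)
+drs <- draws(
+  data     = dat,
+  data_ice = dat_ice,
+  vars     = vars,
+  method   = method_bayes(n_samples = 150)
+)
+
+# 2. Imputation; reference-based strategies require a reference group mapping
+imps <- impute(
+  drs,
+  references = c("Control" = "Control", "Intervention" = "Control")
+)
+
+# 3. Analysis of each imputed dataset (ANCOVA by default; delta adjustments
+#    could be supplied via the `delta` argument)
+anl <- analyse(imps, fun = ancova, vars = vars)
+
+# 4. Pooling and inference
+pool(anl)
+```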
+An established software implementation of reference-based imputation in SAS is provided by the so-called “five macros” by James Roger (Roger (2021)). An alternative R implementation which is also currently under development is the R package RefBasedMI (McGrath and White (2021)).
+rbmi has several features which are not supported by the other implementations:
+- In addition to the Bayesian MI approach implemented also in the other packages, our implementation provides three alternative MI approaches: approximate Bayesian MI, conditional mean imputation combined with resampling, and bootstrapped MI.
+- rbmi allows for the usage of data collected after an ICE. For example, suppose that we want to adopt a treatment policy strategy for the ICE “treatment discontinuation”. A possible implementation of this strategy is to use the observed outcome data for subjects who remain in the study after the ICE and to use reference-based imputation in case the subject drops out. In our implementation, this is achieved by excluding observed post-ICE data from the imputation model (which assumes MAR missingness) but including them in the analysis model. To our knowledge, this is not directly supported by the other implementations.
+- RefBasedMI fits the imputation model to data from each treatment group separately, which implies covariate-treatment group interactions for all covariates in the pooled data from both treatment groups. In contrast, Roger’s five macros assume a joint model including data from all randomized groups, and covariate-treatment interactions are not allowed. We also chose to implement a joint model but use a flexible model for the linear predictor which may or may not include an interaction term between any covariate and the treatment group. In addition, our imputation model also allows for the inclusion of time-varying covariates.
+- In our implementation, the grouping of the subjects for the purpose of the imputation model (and the definition of the reference group) does not need to correspond to the assigned treatment groups. This provides additional flexibility for the imputation procedure. It is not clear to us whether this feature is supported by Roger’s five macros or RefBasedMI.
+- We believe that our R-based implementation is more modular than RefBasedMI, which should facilitate further package enhancements.
+In contrast, the more general causal model introduced by I. White, Royes, and Best (2020) is available in the other implementations but is currently not supported by ours.
+