From 0f1125cdad10068350a466fc7b37750e1bfc3fae Mon Sep 17 00:00:00 2001 From: Daniel Date: Mon, 18 Mar 2024 11:49:30 +0100 Subject: [PATCH] DHARMa implementation for new `check_residuals()` function (#643) Co-authored-by: Michael McCarthy <51542091+mccarthy-m-g@users.noreply.github.com> --- DESCRIPTION | 6 +- NAMESPACE | 19 ++ NEWS.md | 25 +++ R/check_distribution.R | 20 ++- R/check_model.R | 151 ++++++++++++---- R/check_model_diagnostics.R | 77 +++++++- R/check_normality.R | 16 +- R/check_outliers.R | 70 +++++++- R/check_overdispersion.R | 153 +++++++++++----- R/check_predictions.R | 4 +- R/check_residuals.R | 101 +++++++++++ R/check_zeroinflation.R | 123 ++++++++++--- R/simulate_residuals.R | 117 +++++++++++++ _pkgdown.yaml | 1 + inst/WORDLIST | 7 + man/check_model.Rd | 40 ++++- man/check_outliers.Rd | 37 +++- man/check_overdispersion.Rd | 53 +++++- man/check_predictions.Rd | 2 + man/check_residuals.Rd | 65 +++++++ man/check_zeroinflation.Rd | 54 +++++- man/simulate_residuals.Rd | 68 ++++++++ tests/testthat/_snaps/check_collinearity.md | 23 +++ tests/testthat/test-binned_residuals.R | 4 + tests/testthat/test-check_autocorrelation.R | 5 + tests/testthat/test-check_collinearity.R | 5 +- tests/testthat/test-check_convergence.R | 17 ++ .../testthat/test-check_heterogeneity_bias.R | 2 +- .../testthat/test-check_heteroskedasticity.R | 17 ++ tests/testthat/test-check_model.R | 7 +- tests/testthat/test-check_outliers.R | 37 ++++ tests/testthat/test-check_overdispersion.R | 164 +++++++++++++++++- tests/testthat/test-check_residuals.R | 26 +++ tests/testthat/test-check_zeroinflation.R | 147 +++++++++++++++- tests/testthat/test-checks.R | 32 +++- vignettes/simulate_residuals.Rmd | 94 ++++++++++ 36 files changed, 1623 insertions(+), 166 deletions(-) create mode 100644 R/check_residuals.R create mode 100644 R/simulate_residuals.R create mode 100644 man/check_residuals.Rd create mode 100644 man/simulate_residuals.Rd create mode 100644 tests/testthat/test-check_heteroskedasticity.R create mode 100644 tests/testthat/test-check_residuals.R create mode 100644 vignettes/simulate_residuals.Rmd diff --git a/DESCRIPTION b/DESCRIPTION index 218b0aaa7..be9f19035 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: performance Title: Assessment of Regression Models Performance -Version: 0.10.9.6 +Version: 0.10.9.8 Authors@R: c(person(given = "Daniel", family = "Lüdecke", @@ -70,9 +70,8 @@ Depends: R (>= 3.6) Imports: bayestestR (>= 0.13.2), - insight (>= 0.19.8), + insight (>= 0.19.9), datawizard (>= 0.9.1), - methods, stats, utils Suggests: @@ -91,6 +90,7 @@ Suggests: correlation, cplm, dbscan, + DHARMa, estimatr, fixest, flextable, diff --git a/NAMESPACE b/NAMESPACE index 26125fbaf..24daed512 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -55,9 +55,11 @@ S3method(check_heteroscedasticity,default) S3method(check_homogeneity,afex_aov) S3method(check_homogeneity,default) S3method(check_homogeneity,htest) +S3method(check_model,DHARMa) S3method(check_model,brmsfit) S3method(check_model,default) S3method(check_model,model_fit) +S3method(check_model,performance_simres) S3method(check_model,stanreg) S3method(check_multimodal,data.frame) S3method(check_multimodal,numeric) @@ -71,6 +73,7 @@ S3method(check_normality,lmerModLmerTest) S3method(check_normality,merMod) S3method(check_normality,numeric) S3method(check_outliers,BFBayesFactor) +S3method(check_outliers,DHARMa) S3method(check_outliers,character) S3method(check_outliers,data.frame) S3method(check_outliers,default) @@ -87,11 +90,13 @@ 
S3method(check_outliers,meta) S3method(check_outliers,metabin) S3method(check_outliers,metagen) S3method(check_outliers,numeric) +S3method(check_outliers,performance_simres) S3method(check_outliers,rma) S3method(check_outliers,rma.uni) S3method(check_outliers,rq) S3method(check_outliers,rqs) S3method(check_outliers,rqss) +S3method(check_overdispersion,DHARMa) S3method(check_overdispersion,default) S3method(check_overdispersion,fixest) S3method(check_overdispersion,fixest_multi) @@ -103,11 +108,15 @@ S3method(check_overdispersion,model_fit) S3method(check_overdispersion,negbin) S3method(check_overdispersion,negbinirr) S3method(check_overdispersion,negbinmfx) +S3method(check_overdispersion,performance_simres) S3method(check_overdispersion,poissonirr) S3method(check_overdispersion,poissonmfx) S3method(check_predictions,BFBayesFactor) S3method(check_predictions,default) S3method(check_predictions,lme) +S3method(check_residuals,DHARMa) +S3method(check_residuals,default) +S3method(check_residuals,performance_simres) S3method(check_singularity,MixMod) S3method(check_singularity,clmm) S3method(check_singularity,cpglmm) @@ -123,6 +132,9 @@ S3method(check_sphericity,default) S3method(check_sphericity,mlm) S3method(check_symmetry,htest) S3method(check_symmetry,numeric) +S3method(check_zeroinflation,DHARMa) +S3method(check_zeroinflation,default) +S3method(check_zeroinflation,performance_simres) S3method(cronbachs_alpha,data.frame) S3method(cronbachs_alpha,matrix) S3method(cronbachs_alpha,parameters_pca) @@ -261,10 +273,12 @@ S3method(plot,check_model) S3method(plot,check_normality) S3method(plot,check_outliers) S3method(plot,check_overdisp) +S3method(plot,check_residuals) S3method(plot,check_sphericity) S3method(plot,compare_performance) S3method(plot,performance_pp_check) S3method(plot,performance_roc) +S3method(plot,performance_simres) S3method(plot,test_likelihoodratio) S3method(plot,test_performance) S3method(print,binned_residuals) @@ -283,7 +297,9 @@ S3method(print,check_normality_binom) S3method(print,check_outliers) S3method(print,check_outliers_metafor) S3method(print,check_outliers_metagen) +S3method(print,check_outliers_simres) S3method(print,check_overdisp) +S3method(print,check_residuals) S3method(print,check_sphericity) S3method(print,check_symmetry) S3method(print,check_zi) @@ -302,6 +318,7 @@ S3method(print,performance_pcp) S3method(print,performance_pp_check) S3method(print,performance_roc) S3method(print,performance_score) +S3method(print,performance_simres) S3method(print,r2_bayes) S3method(print,r2_generic) S3method(print,r2_loo) @@ -535,6 +552,7 @@ export(check_outliers) export(check_overdispersion) export(check_posterior_predictions) export(check_predictions) +export(check_residuals) export(check_singularity) export(check_sphericity) export(check_sphericity_bartlett) @@ -588,6 +606,7 @@ export(r2_tjur) export(r2_xu) export(r2_zeroinflated) export(rmse) +export(simulate_residuals) export(test_bf) export(test_likelihoodratio) export(test_lrt) diff --git a/NEWS.md b/NEWS.md index 9e2f2daa8..6fb76ca53 100644 --- a/NEWS.md +++ b/NEWS.md @@ -4,10 +4,35 @@ * Rudimentary support for models of class `serp` from package *serp*. +## New functions + +* `simulate_residuals()` and `check_residuals()`, to simulate and check residuals + from generalized linear (mixed) models. Simulating residuals is based on the + DHARMa package, and objects returned by `simulate_residuals()` inherit from + the `DHARMa` class, and thus can be used with any functions from the *DHARMa* + package. 
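As a quick, illustrative sketch of the intended workflow (the model and data below are examples only; the data set is assumed to be available from the *glmmTMB* package, and only `simulate_residuals()`, `check_residuals()` and `check_overdispersion()` come from this patch):

```r
library(performance)

# example count model; Salamanders ships with glmmTMB (assumption for illustration)
data(Salamanders, package = "glmmTMB")
model <- glm(count ~ spp + mined, family = poisson, data = Salamanders)

simulated <- simulate_residuals(model) # object also inherits from class `DHARMa`
check_residuals(simulated)             # KS test for uniformity of the residuals
check_overdispersion(simulated)        # the same object works for further checks
```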
However, there are also implementations in the *performance* package, + such as `check_overdispersion()`, `check_zeroinflation()`, `check_outliers()` + or `check_model()`. + +* Plots for `check_model()` have been improved. The Q-Q plots are now based + on simulated residuals from the DHARMa package for non-Gaussian models, thus + providing more accurate and informative plots. The half-normal QQ plot for + generalized linear models can still be obtained by setting the new argument + `residual_type = "normal"`. + +* Following functions now support simulated residuals (from `simulate_residuals()`) + resp. objects returned from `DHARMa::simulateResiduals()`: + - `check_overdispersion()` + - `check_zeroinflation()` + - `check_outliers()` + - `check_model()` + ## General * Improved error messages for `check_model()` when QQ-plots cannot be created. +* `check_distribution()` is more stable for possibly sparse data. + ## Bug fixes * Fixed issue in `check_normality()` for t-tests. diff --git a/R/check_distribution.R b/R/check_distribution.R index 6dc5f7481..89f48263a 100644 --- a/R/check_distribution.R +++ b/R/check_distribution.R @@ -77,7 +77,7 @@ check_distribution.default <- function(model) { } else { x <- stats::residuals(model) } - dat <- .extract_features(x) + dat <- .extract_features(x, "residuals") dist_residuals <- as.data.frame(t(stats::predict(classify_distribution, dat, type = "prob"))) @@ -88,7 +88,7 @@ check_distribution.default <- function(model) { dummy_factors = FALSE, preserve_levels = TRUE ) - dat <- .extract_features(x) + dat <- .extract_features(x, "response") dist_response <- as.data.frame(t(stats::predict(classify_distribution, dat, type = "prob"))) @@ -189,15 +189,27 @@ check_distribution.numeric <- function(model) { # utilities ----------------------------- -.extract_features <- function(x) { +.extract_features <- function(x, type = NULL) { # validation check, remove missings x <- x[!is.na(x)] + # this might fail, so we wrap in ".safe()" + map_est <- .safe(mean(x) - as.numeric(bayestestR::map_estimate(x, bw = "nrd0"))) + + if (is.null(map_est)) { + map_est <- mean(x) - datawizard::distribution_mode(x) + msg <- "Could not accurately estimate the mode." + if (!is.null(type)) { + msg <- paste(msg, "Predicted distribution of the", type, "may be less accurate.") + } + insight::format_alert(msg) + } + data.frame( SD = stats::sd(x), MAD = stats::mad(x, constant = 1), Mean_Median_Distance = mean(x) - stats::median(x), - Mean_Mode_Distance = mean(x) - as.numeric(bayestestR::map_estimate(x, bw = "nrd0")), + Mean_Mode_Distance = map_est, SD_MAD_Distance = stats::sd(x) - stats::mad(x, constant = 1), Var_Mean_Distance = stats::var(x) - mean(x), Range_SD = diff(range(x)) / stats::sd(x), diff --git a/R/check_model.R b/R/check_model.R index e015bff80..6be17be22 100644 --- a/R/check_model.R +++ b/R/check_model.R @@ -12,40 +12,50 @@ #' @param panel Logical, if `TRUE`, plots are arranged as panels; else, #' single plots for each diagnostic are returned. #' @param check Character vector, indicating which checks for should be performed -#' and plotted. May be one or more of `"all"`, `"vif"`, `"qq"`, `"normality"`, -#' `"linearity"`, `"ncv"`, `"homogeneity"`, `"outliers"`, `"reqq"`, `"pp_check"`, -#' `"binned_residuals"` or `"overdispersion"`. Note that not all check apply -#' to all type of models (see 'Details'). `"reqq"` is a QQ-plot for random -#' effects and only available for mixed models. `"ncv"` is an alias for -#' `"linearity"`, and checks for non-constant variance, i.e. 
for -#' heteroscedasticity, as well as the linear relationship. By default, all -#' possible checks are performed and plotted. +#' and plotted. May be one or more of `"all"`, `"vif"`, `"qq"`, `"normality"`, +#' `"linearity"`, `"ncv"`, `"homogeneity"`, `"outliers"`, `"reqq"`, `"pp_check"`, +#' `"binned_residuals"` or `"overdispersion"`. Note that not all checks apply +#' to all types of models (see 'Details'). `"reqq"` is a QQ-plot for random +#' effects and only available for mixed models. `"ncv"` is an alias for +#' `"linearity"`, and checks for non-constant variance, i.e. for +#' heteroscedasticity, as well as the linear relationship. By default, all +#' possible checks are performed and plotted. #' @param alpha,dot_alpha The alpha level of the confidence bands and dot-geoms. -#' Scalar from 0 to 1. +#' Scalar from 0 to 1. #' @param colors Character vector with color codes (hex-format). Must be of -#' length 3. First color is usually used for reference lines, second color -#' for dots, and third color for outliers or extreme values. +#' length 3. First color is usually used for reference lines, second color +#' for dots, and third color for outliers or extreme values. #' @param theme String, indicating the name of the plot-theme. Must be in the -#' format `"package::theme_name"` (e.g. `"ggplot2::theme_minimal"`). +#' format `"package::theme_name"` (e.g. `"ggplot2::theme_minimal"`). #' @param detrend Logical. Should Q-Q/P-P plots be detrended? Defaults to -#' `TRUE`. +#' `TRUE` for linear models or when `residual_type = "normal"`. Defaults to +#' `FALSE` for QQ plots based on simulated residuals (i.e. when +#' `residual_type = "simulated"`). +#' @param residual_type Character, indicating the type of residuals to be used. +#' For non-Gaussian models, the default is `"simulated"`, which uses simulated +#' residuals. These are based on [`simulate_residuals()`] and thus use the +#' **DHARMa** package to return randomized quantile residuals. For Gaussian +#' models, the default is `"normal"`, which uses the default residuals from +#' the model. Setting `residual_type = "normal"` for non-Gaussian models will +#' use a half-normal Q-Q plot of the absolute value of the standardized deviance +#' residuals. #' @param show_dots Logical, if `TRUE`, will show data points in the plot. Set -#' to `FALSE` for models with many observations, if generating the plot is too -#' time-consuming. By default, `show_dots = NULL`. In this case `check_model()` -#' tries to guess whether performance will be poor due to a very large model -#' and thus automatically shows or hides dots. +#' to `FALSE` for models with many observations, if generating the plot is too -#' time-consuming. By default, `show_dots = NULL`. In this case `check_model()` +#' tries to guess whether performance will be poor due to a very large model +#' and thus automatically shows or hides dots. #' @param verbose If `FALSE` (default), suppress most warning messages. #' @param ... Arguments passed down to the individual check functions, especially -#' to `check_predictions()` and `binned_residuals()`. +#' to `check_predictions()` and `binned_residuals()`. #' @inheritParams check_predictions #' #' @return The data frame that is used for plotting. #' #' @note This function just prepares the data for plotting. To create the plots, -#' **see** needs to be installed. Furthermore, this function suppresses -#' all possible warnings.
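To make the new `residual_type` and `detrend` arguments documented above concrete, a hypothetical call could look like this (a sketch only; the model is illustrative and not taken from the patch):

```r
# Poisson model, i.e. non-Gaussian: simulated (DHARMa-based) residuals are the default
data(Salamanders, package = "glmmTMB")
model <- glm(count ~ spp + mined, family = poisson, data = Salamanders)

check_model(model)  # Q-Q plot based on simulated residuals, detrend = FALSE

# fall back to the classic half-normal Q-Q plot of deviance residuals
check_model(model, residual_type = "normal", detrend = TRUE)
```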
In case you observe suspicious plots, please refer -#' to the dedicated functions (like `check_collinearity()`, -#' `check_normality()` etc.) to get informative messages and warnings. +#' **see** needs to be installed. Furthermore, this function suppresses +#' all possible warnings. In case you observe suspicious plots, please refer +#' to the dedicated functions (like `check_collinearity()`, +#' `check_normality()` etc.) to get informative messages and warnings. #' #' @details For Bayesian models from packages **rstanarm** or **brms**, #' models will be "converted" to their frequentist counterpart, using @@ -103,10 +113,20 @@ #' normally distributed. Usually, dots should fall along the line. If there is #' some deviation (mostly at the tails), this indicates that the model doesn't #' predict the outcome well for that range that shows larger deviations from -#' the line. For generalized linear models, a half-normal Q-Q plot of the -#' absolute value of the standardized deviance residuals is shown, however, the -#' interpretation of the plot remains the same. See [`check_normality()`] for -#' further details. +#' the line. For generalized linear models and when `residual_type = "normal"`, +#' a half-normal Q-Q plot of the absolute value of the standardized deviance +#' residuals is shown; however, the interpretation of the plot remains the same. +#' See [`check_normality()`] for further details. Usually, for generalized linear +#' (mixed) models, a test for uniformity of residuals based on simulated residuals +#' is conducted (see next section). +#' +#' @section Uniformity of Residuals: +#' For non-Gaussian models, when `residual_type = "simulated"` (the default +#' for generalized linear (mixed) models), residuals are not expected to be +#' normally distributed. In this case, the created Q-Q plot checks the uniformity +#' of residuals. The interpretation of the plot is the same as for the normal +#' Q-Q plot. See [`simulate_residuals()`] and [`check_residuals()`] for further +#' details. #' #' @section Overdispersion: #' For count models, an *overdispersion plot* is shown. Overdispersion occurs @@ -124,12 +144,12 @@ #' inside the error bounds. See [`binned_residuals()`] for further details. #' #' @section Residuals for (Generalized) Linear Models: -#' Plots that check the normality of residuals (QQ-plot) or the homogeneity of +#' Plots that check the normality of residuals (Q-Q plot) or the homogeneity of #' variance use standardized Pearson's residuals for generalized linear models, #' and standardized residuals for linear models. The plots for the normality of #' residuals (with overlayed normal curve) and for the linearity assumption use -#' the default residuals for `lm` and `glm` (which are deviance -#' residuals for `glm`). +#' the default residuals for `lm` and `glm` (which are deviance residuals for +#' `glm`). #' #' @section Troubleshooting: #' For models with many observations, or for more complex models in general, @@ -174,6 +194,7 @@ check_model.default <- function(x, show_dots = NULL, bandwidth = "nrd", type = "density", + residual_type = NULL, verbose = FALSE, ...)
{ # check model formula @@ -183,13 +204,22 @@ check_model.default <- function(x, minfo <- insight::model_info(x, verbose = FALSE) + # set default for residual_type + if (is.null(residual_type)) { + residual_type <- ifelse(minfo$is_linear && !minfo$is_gam, "normal", "simulated") + } + # set default for detrend + if (missing(detrend)) { + detrend <- residual_type == "normal" + } + assumptions_data <- tryCatch( if (minfo$is_bayesian) { suppressWarnings(.check_assumptions_stan(x, ...)) } else if (minfo$is_linear) { - suppressWarnings(.check_assumptions_linear(x, minfo, verbose, ...)) + suppressWarnings(.check_assumptions_linear(x, minfo, residual_type, verbose, ...)) } else { - suppressWarnings(.check_assumptions_glm(x, minfo, verbose, ...)) + suppressWarnings(.check_assumptions_glm(x, minfo, residual_type, verbose, ...)) }, error = function(e) { e @@ -272,6 +302,7 @@ check_model.stanreg <- function(x, show_dots = NULL, bandwidth = "nrd", type = "density", + residual_type = NULL, verbose = TRUE, ...) { check_model(bayestestR::bayesian_as_frequentist(x), @@ -287,6 +318,7 @@ check_model.stanreg <- function(x, show_dots = show_dots, bandwidth = bandwidth, type = type, + residual_type = residual_type, verbose = verbose, ... ) @@ -311,6 +343,7 @@ check_model.model_fit <- function(x, show_dots = NULL, bandwidth = "nrd", type = "density", + residual_type = NULL, verbose = TRUE, ...) { check_model( @@ -327,20 +360,65 @@ check_model.model_fit <- function(x, show_dots = show_dots, bandwidth = bandwidth, type = type, + residual_type = residual_type, verbose = verbose, ... ) } +#' @export +check_model.performance_simres <- function(x, + dot_size = 2, + line_size = 0.8, + panel = TRUE, + check = "all", + alpha = 0.2, + dot_alpha = 0.8, + colors = c("#3aaf85", "#1b6ca8", "#cd201f"), + theme = "see::theme_lucid", + detrend = TRUE, + show_dots = NULL, + bandwidth = "nrd", + type = "density", + residual_type = NULL, + verbose = TRUE, + ...) { + check_model( + x$fittedModel, + dot_size = dot_size, + line_size = line_size, + panel = panel, + check = check, + alpha = alpha, + dot_alpha = dot_alpha, + colors = colors, + theme = theme, + detrend = detrend, + show_dots = show_dots, + bandwidth = bandwidth, + type = type, + residual_type = "simulated", + verbose = verbose, + ... + ) +} + +#' @export +check_model.DHARMa <- check_model.performance_simres + + # compile plots for checks of linear models ------------------------ -.check_assumptions_linear <- function(model, model_info, verbose = TRUE, ...) { +.check_assumptions_linear <- function(model, model_info, residual_type = "normal", verbose = TRUE, ...) { dat <- list() dat$VIF <- .diag_vif(model, verbose = verbose) - dat$QQ <- .diag_qq(model, model_info = model_info, verbose = verbose) + dat$QQ <- switch(residual_type, + simulated = simulate_residuals(model, ...), + .diag_qq(model, model_info = model_info, verbose = verbose) + ) dat$REQQ <- .diag_reqq(model, level = 0.95, model_info = model_info, verbose = verbose) dat$NORM <- .diag_norm(model, verbose = verbose) dat$NCV <- .diag_ncv(model, verbose = verbose) @@ -363,11 +441,14 @@ check_model.model_fit <- function(x, # compile plots for checks of generalized linear models ------------------------ -.check_assumptions_glm <- function(model, model_info, verbose = TRUE, ...) { +.check_assumptions_glm <- function(model, model_info, residual_type = "simulated", verbose = TRUE, ...) 
{ dat <- list() dat$VIF <- .diag_vif(model, verbose = verbose) - dat$QQ <- .diag_qq(model, model_info = model_info, verbose = verbose) + dat$QQ <- switch(residual_type, + simulated = simulate_residuals(model, ...), + .diag_qq(model, model_info = model_info, verbose = verbose) + ) dat$HOMOGENEITY <- .diag_homogeneity(model, verbose = verbose) dat$REQQ <- .diag_reqq(model, level = 0.95, model_info = model_info, verbose = verbose) dat$OUTLIERS <- .safe(check_outliers(model, method = "cook")) diff --git a/R/check_model_diagnostics.R b/R/check_model_diagnostics.R index ff9482934..39a3c843d 100644 --- a/R/check_model_diagnostics.R +++ b/R/check_model_diagnostics.R @@ -293,7 +293,81 @@ # prepare data for homogeneity of variance plot ---------------------------------- -.diag_overdispersion <- function(model) { +.new_diag_overdispersion <- function(model, ...) { + faminfo <- insight::model_info(model) + + simres <- simulate_residuals(model, ...) + predicted <- simres$fittedPredictedResponse + d <- data.frame(Predicted = predicted) + + # residuals based on simulated residuals - but we want normally distributed residuals + d$Residuals <- stats::residuals(simres, quantileFunction = stats::qnorm, ...) + d$Res2 <- d$Residuals^2 + d$StdRes <- insight::get_residuals(model, type = "pearson") + + # data for poisson models + if (faminfo$is_poisson && !faminfo$is_zero_inflated) { + d$V <- predicted + } + + # data for negative binomial models + if (faminfo$is_negbin && !faminfo$is_zero_inflated) { + if (inherits(model, "glmmTMB")) { + if (faminfo$family == "nbinom1") { + # for nbinom1, we can use "sigma()" + d$V <- insight::get_sigma(model)^2 * stats::family(model)$variance(predicted) + } else { + # for nbinom2, "sigma()" has "inverse meaning" (see #654) + d$V <- (1 / insight::get_sigma(model)^2) * stats::family(model)$variance(predicted) + } + } else { + ## FIXME: this is not correct for glm.nb models? + d$V <- predicted * (1 + predicted / insight::get_sigma(model)) + } + } + + # data for zero-inflated poisson models + if (faminfo$is_poisson && faminfo$is_zero_inflated) { + if (inherits(model, "glmmTMB")) { + ptype <- "zprob" + } else { + ptype <- "zero" + } + d$Prob <- stats::predict(model, type = ptype) + d$V <- predicted * (1 - d$Prob) * (1 + predicted * d$Prob) + } + + # data for zero-inflated negative binomial models + if (faminfo$is_negbin && faminfo$is_zero_inflated && !faminfo$is_dispersion) { + if (inherits(model, "glmmTMB")) { + ptype <- "zprob" + } else { + ptype <- "zero" + } + d$Prob <- stats::predict(model, type = ptype) + d$Disp <- insight::get_sigma(model) + d$V <- predicted * (1 + predicted / d$Disp) * (1 - d$Prob) * (1 + predicted * (1 + predicted / d$Disp) * d$Prob) # nolint + } + + # data for zero-inflated negative binomial models with dispersion + if (faminfo$is_negbin && faminfo$is_zero_inflated && faminfo$is_dispersion) { + d <- data.frame(Predicted = stats::predict(model, type = "response")) + if (inherits(model, "glmmTMB")) { + ptype <- "zprob" + } else { + ptype <- "zero" + } + d$Prob <- stats::predict(model, type = ptype) + d$Disp <- stats::predict(model, type = "disp") + d$V <- predicted * (1 + predicted / d$Disp) * (1 - d$Prob) * (1 + predicted * (1 + predicted / d$Disp) * d$Prob) # nolint + } + + d +} + + + +.diag_overdispersion <- function(model, ...) 
{ faminfo <- insight::model_info(model) # data for poisson models @@ -380,7 +454,6 @@ } - # helpers ---------------------------------- .sigma_glmmTMB_nonmixed <- function(model, faminfo) { diff --git a/R/check_normality.R b/R/check_normality.R index 9dc00d03f..2b9f071d5 100644 --- a/R/check_normality.R +++ b/R/check_normality.R @@ -58,7 +58,7 @@ check_normality.default <- function(x, ...) { if (!insight::model_info(x)$is_linear) { insight::format_alert( - "Checking normality of residuals is only appropriate for linear models." + "Checking normality of residuals is only appropriate for linear models. It is recommended to use `simulate_residuals()` and `check_residuals()` to check generalized linear (mixed) models for uniformity of residuals." # nolint ) return(NULL) } @@ -87,7 +87,7 @@ check_normality.glm <- function(x, ...) { insight::format_alert( "There's no formal statistical test for normality for generalized linear model.", - "Please use `plot()` on the return value of this function: `plot(check_normality(model))`" + "Instead, please use `simulate_residuals()` and `check_residuals()` to check for uniformity of residuals." ) invisible(out) } @@ -181,7 +181,7 @@ check_normality.merMod <- function(x, effects = c("fixed", "random"), ...) { # valid model? if (!info$is_linear && effects == "fixed") { insight::format_alert( - "Checking normality of residuals is only appropriate for linear models." + "Checking normality of residuals is only appropriate for linear models. It is recommended to use `simulate_residuals()` and `check_residuals()` to check generalized linear (mixed) models for uniformity of residuals." # nolint ) return(NULL) } @@ -200,7 +200,7 @@ check_normality.merMod <- function(x, effects = c("fixed", "random"), ...) { } }, error = function(e) { - return(NULL) + NULL } ) @@ -217,6 +217,8 @@ check_normality.merMod <- function(x, effects = c("fixed", "random"), ...) { attr(p.val, "type") <- "random effects" attr(p.val, "re_groups") <- re_groups } + } else if (inherits(x, "glmmTMB")) { + p.val <- .check_normality(stats::residuals(x, type = "deviance"), x) } else { # check for normality of residuals p.val <- .check_normality(stats::rstudent(x), x) @@ -260,7 +262,7 @@ check_normality.BFBayesFactor <- check_normality.afex_aov # helper --------------------- .check_normality <- function(x, model, type = "residuals") { - ts <- .safe({ + ts_result <- .safe({ if (length(x) >= 5000) { suppressWarnings(stats::ks.test(x, y = "pnorm", alternative = "two.sided")) } else { @@ -268,7 +270,7 @@ check_normality.BFBayesFactor <- check_normality.afex_aov } }) - if (is.null(ts)) { + if (is.null(ts_result)) { insight::print_color( sprintf("`check_normality()` does not support models of class `%s`.\n", class(model)[1]), "red" @@ -276,7 +278,7 @@ check_normality.BFBayesFactor <- check_normality.afex_aov return(NULL) } - out <- ts$p.value + out <- ts_result$p.value attr(out, "type") <- type out diff --git a/R/check_outliers.R b/R/check_outliers.R index 2017d4cc5..a589b1309 100644 --- a/R/check_outliers.R +++ b/R/check_outliers.R @@ -12,7 +12,8 @@ #' by at least half of the methods). See the **Details** section below #' for a description of the methods. #' -#' @param x A model or a data.frame object. +#' @param x A model, a data.frame, a `performance_simres` [`simulate_residuals()`] +#' or a `DHARMa` object. #' @param method The outlier detection method(s). 
Can be `"all"` or some of #' `"cook"`, `"pareto"`, `"zscore"`, `"zscore_robust"`, `"iqr"`, `"ci"`, `"eti"`, #' `"hdi"`, `"bci"`, `"mahalanobis"`, `"mahalanobis_robust"`, `"mcd"`, `"ics"`, @@ -23,11 +24,19 @@ #' 'Details'). If a numeric value is given, it will be used as the threshold #' for any of the method run. #' @param ID Optional, to report an ID column along with the row number. +#' @param type Type of method to test for outliers. Can be one of `"default"`, +#' `"binomial"` or `"bootstrap"`. Only applies when `x` is an object returned +#' by `simulate_residuals()` or of class `DHARMa`. See 'Details' in +#' `?DHARMa::testOutliers` for a detailed description of the types. #' @param verbose Toggle warnings. #' @param ... When `method = "ics"`, further arguments in `...` are passed #' down to [ICSOutlier::ics.outlier()]. When `method = "mahalanobis"`, #' they are passed down to [stats::mahalanobis()]. `percentage_central` can -#' be specified when `method = "mcd"`. +#' be specified when `method = "mcd"`. For objects of class `performance_simres` +#' or `DHARMa`, further arguments are passed down to `DHARMa::testOutliers()`. + +#' @inheritParams check_zeroinflation +#' @inheritParams simulate_residuals #' #' @return A logical vector of the detected outliers with a nice printing #' method: a check (message) on whether outliers were detected or not. The @@ -200,6 +209,17 @@ #' observations located at `qnorm(1-0.025) * SD)` of the log-transformed #' LOF distance. Requires the **dbscan** package. #' +#' @section Methods for simulated residuals: +#' +#' The approach for detecting outliers based on simulated residuals differs +#' from the traditional methods and may not detect outliers as expected. +#' In essence, this approach compares observed to simulated values. However, we +#' do not know the deviation of the observed data from the model expectation, and +#' thus, the term "outlier" should be taken with a grain of salt. It refers to +#' "simulation outliers". Basically, the comparison tests whether an observed +#' data point is outside the simulated range. It is strongly recommended to read +#' the related documentation in the **DHARMa** package, e.g. `?DHARMa::testOutliers`. +#' #' @section Threshold specification: #' #' Default thresholds are currently specified as follows: @@ -785,6 +805,28 @@ plot.check_outliers <- function(x, ...) { NextMethod() } +#' @export +print.check_outliers_simres <- function(x, digits = 2, ...) { +  result <- paste0( +    insight::format_value(100 * x$Expected, digits = digits, ...), +    "%, ", +    insight::format_ci(100 * x$CI_low, 100 * x$CI_high, digits = digits, ...) +  ) +  insight::print_color("# Outliers detection\n\n", "blue") +  cat(sprintf("  Proportion of observed outliers: %.*f%%\n", digits, 100 * x$Coefficient)) +  cat(sprintf("  Proportion of expected outliers: %s\n\n", result)) + +  p_string <- paste0(" (", insight::format_p(x$p_value), ")") + +  if (x$p_value < 0.05) { +    message("Outliers were detected", p_string, ".") +  } else { +    message("No outliers were detected", p_string, ".") +  } + +  invisible(x) +} + + # other classes ------------------------- @@ -1438,6 +1480,30 @@ check_outliers.meta <- check_outliers.metagen check_outliers.metabin <- check_outliers.metagen +#' @rdname check_outliers +#' @export +check_outliers.performance_simres <- function(x, type = "default", iterations = 100, alternative = "two.sided", ...)
{ +  type <- match.arg(type, c("default", "binomial", "bootstrap")) +  alternative <- match.arg(alternative, c("two.sided", "greater", "less")) + +  insight::check_if_installed("DHARMa") +  result <- DHARMa::testOutliers(x, type = type, nBoot = iterations, alternative = alternative, plot = FALSE, ...) + +  outlier <- list( +    Coefficient = as.vector(result$estimate), +    Expected = as.numeric(gsub("(.*)\\(expected: (\\d.*)\\)", "\\2", names(result$estimate))), +    CI_low = result$conf.int[1], +    CI_high = result$conf.int[2], +    p_value = result$p.value +  ) +  class(outlier) <- c("check_outliers_simres", class(outlier)) +  outlier +} + +#' @export +check_outliers.DHARMa <- check_outliers.performance_simres + + # Thresholds -------------------------------------------------------------- diff --git a/R/check_overdispersion.R b/R/check_overdispersion.R index 95ad62c0a..0a9cfb595 100644 --- a/R/check_overdispersion.R +++ b/R/check_overdispersion.R @@ -1,39 +1,51 @@ -#' @title Check overdispersion of GL(M)M's +#' @title Check overdispersion (and underdispersion) of GL(M)M's #' @name check_overdispersion #' #' @description `check_overdispersion()` checks generalized linear (mixed) -#' models for overdispersion. +#' models for overdispersion (and underdispersion). #' #' @param x Fitted model of class `merMod`, `glmmTMB`, `glm`, or `glm.nb` -#' (package **MASS**). -#' @param ... Currently not used. +#' (package **MASS**), or an object returned by `simulate_residuals()`. +#' +#' @inheritParams check_zeroinflation #' #' @return A list with results from the overdispersion test, like chi-squared #' statistics, p-value or dispersion ratio. #' #' @details Overdispersion occurs when the observed variance is higher than the -#' variance of a theoretical model. For Poisson models, variance increases -#' with the mean and, therefore, variance usually (roughly) equals the mean -#' value. If the variance is much higher, the data are "overdispersed". +#' variance of a theoretical model. For Poisson models, variance increases +#' with the mean and, therefore, variance usually (roughly) equals the mean +#' value. If the variance is much higher, the data are "overdispersed". A less +#' common case is underdispersion, where the variance is much lower than the +#' mean. #' #' @section Interpretation of the Dispersion Ratio: #' If the dispersion ratio is close to one, a Poisson model fits well to the #' data. Dispersion ratios larger than one indicate overdispersion, thus a -#' negative binomial model or similar might fit better to the data. A p-value < -#' .05 indicates overdispersion. +#' negative binomial model or similar might fit better to the data. Dispersion +#' ratios much smaller than one indicate underdispersion. A p-value < .05 +#' indicates either overdispersion or underdispersion (the first being more common). #' #' @section Overdispersion in Poisson Models: #' For Poisson models, the overdispersion test is based on the code from #' _Gelman and Hill (2007), page 115_. #' +#' @section Overdispersion in Negative Binomial or Zero-Inflated Models: +#' For negative binomial (mixed) models or models with a zero-inflation component, +#' the overdispersion test is based on simulated residuals (see [`simulate_residuals()`]). +#' #' @section Overdispersion in Mixed Models: #' For `merMod`- and `glmmTMB`-objects, `check_overdispersion()` #' is based on the code in the #' [GLMM FAQ](http://bbolker.github.io/mixedmodels-misc/glmmFAQ.html), #' section *How can I deal with overdispersion in GLMMs?*.
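For orientation, the Gelman and Hill (2007) check referenced above amounts to a Pearson chi-squared test; the following is only a rough sketch of that idea with illustrative variable names, not the package's internal code:

```r
# rough sketch of the Poisson overdispersion check (Gelman & Hill, 2007)
yhat <- stats::fitted(model)
y <- insight::get_response(model)
pearson_resid <- (y - yhat) / sqrt(yhat)

chisq <- sum(pearson_resid^2)
ratio <- chisq / stats::df.residual(model)  # dispersion ratio, close to 1 if Poisson fits
p_value <- stats::pchisq(chisq, df = stats::df.residual(model), lower.tail = FALSE)
```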
Note that this #' function only returns an *approximate* estimate of an overdispersion -#' parameter, and is probably inaccurate for zero-inflated mixed models (fitted -#' with `glmmTMB`). +#' parameter. Using this approach would be inaccurate for zero-inflated or +#' negative binomial mixed models (fitted with `glmmTMB`), thus, in such cases, +#' the overdispersion test is based on [`simulate_residuals()`] (which is identical +#' to `check_overdispersion(simulate_residuals(model))`). +#' +#' @inheritSection check_zeroinflation Tests based on simulated residuals #' #' @section How to fix Overdispersion: #' Overdispersion can be fixed by either modeling the dispersion parameter, or @@ -113,7 +125,9 @@ print.check_overdisp <- function(x, digits = 3, ...) { orig_x <- x x$dispersion_ratio <- sprintf("%.*f", digits, x$dispersion_ratio) - x$chisq_statistic <- sprintf("%.*f", digits, x$chisq_statistic) + if (!is.null(x$chisq_statistic)) { + x$chisq_statistic <- sprintf("%.*f", digits, x$chisq_statistic) + } x$p_value <- pval <- round(x$p_value, digits = digits) if (x$p_value < 0.001) x$p_value <- "< 0.001" @@ -125,14 +139,21 @@ print.check_overdisp <- function(x, digits = 3, ...) { ) insight::print_color("# Overdispersion test\n\n", "blue") - cat(sprintf(" dispersion ratio = %s\n", format(x$dispersion_ratio, justify = "right", width = maxlen))) - cat(sprintf(" Pearson's Chi-Squared = %s\n", format(x$chisq_statistic, justify = "right", width = maxlen))) - cat(sprintf(" p-value = %s\n\n", format(x$p_value, justify = "right", width = maxlen))) + if (is.null(x$chisq_statistic)) { + cat(sprintf(" dispersion ratio = %s\n", format(x$dispersion_ratio, justify = "right", width = maxlen))) + cat(sprintf(" p-value = %s\n\n", format(x$p_value, justify = "right", width = maxlen))) + } else { + cat(sprintf(" dispersion ratio = %s\n", format(x$dispersion_ratio, justify = "right", width = maxlen))) + cat(sprintf(" Pearson's Chi-Squared = %s\n", format(x$chisq_statistic, justify = "right", width = maxlen))) + cat(sprintf(" p-value = %s\n\n", format(x$p_value, justify = "right", width = maxlen))) + } if (pval > 0.05) { message("No overdispersion detected.") - } else { + } else if (x$dispersion_ratio > 1) { message("Overdispersion detected.") + } else { + message("Underdispersion detected.") } invisible(orig_x) @@ -144,8 +165,21 @@ print.check_overdisp <- function(x, digits = 3, ...) { #' @export check_overdispersion.glm <- function(x, verbose = TRUE, ...) { - # check if we have poisson + # model info info <- insight::model_info(x) + obj_name <- insight::safe_deparse_symbol(substitute(x)) + + # for certain distributions, simulated residuals are more accurate + use_simulated <- info$is_bernoulli || info$is_binomial || (!info$is_count && !info$is_binomial) || info$is_negbin + + # model classes not supported in DHARMa + not_supported <- c("fixest", "glmx") + + if (use_simulated && !inherits(x, not_supported)) { + return(check_overdispersion(simulate_residuals(x, ...), object_name = obj_name, ...)) + } + + # check if we have poisson - need this for models not supported by DHARMa if (!info$is_count && !info$is_binomial) { insight::format_error( "Overdispersion checks can only be used for models from Poisson families or binomial families with trials > 1." @@ -157,10 +191,6 @@ check_overdispersion.glm <- function(x, verbose = TRUE, ...) 
{ insight::format_error("Overdispersion checks cannot be used for Bernoulli models.") } - if (info$is_binomial) { - return(check_overdispersion.merMod(x, verbose = verbose, ...)) - } - yhat <- stats::fitted(x) n <- stats::nobs(x) @@ -179,7 +209,7 @@ check_overdispersion.glm <- function(x, verbose = TRUE, ...) { ) class(out) <- c("check_overdisp", "see_check_overdisp") - attr(out, "object_name") <- insight::safe_deparse_symbol(substitute(x)) + attr(out, "object_name") <- obj_name out } @@ -219,39 +249,30 @@ check_overdispersion.model_fit <- check_overdispersion.poissonmfx # Overdispersion for mixed models --------------------------- #' @export -check_overdispersion.merMod <- function(x, verbose = TRUE, ...) { - # check if we have poisson or binomial +check_overdispersion.merMod <- function(x, ...) { + # for certain distributions, simulated residuals are more accurate info <- insight::model_info(x) - if (!info$is_count && !info$is_binomial) { - insight::format_error( - "Overdispersion checks can only be used for models from Poisson families or binomial families with trials > 1." - ) - } + obj_name <- insight::safe_deparse_symbol(substitute(x)) - # check for Bernoulli - if (info$is_bernoulli) { - insight::format_error("Overdispersion checks cannot be used for Bernoulli models.") + # for certain distributions, simulated residuals are more accurate + use_simulated <- info$family == "genpois" || info$is_zero_inflated || info$is_bernoulli || info$is_binomial || (!info$is_count && !info$is_binomial) || info$is_negbin # nolint + + if (use_simulated) { + return(check_overdispersion(simulate_residuals(x, ...), object_name = obj_name, ...)) } rdf <- stats::df.residual(x) rp <- insight::get_residuals(x, type = "pearson") + + # check if pearson residuals are available if (insight::is_empty_object(rp)) { - Pearson.chisq <- NA - prat <- NA - pval <- NA - rp <- NA - if (isTRUE(verbose)) { - insight::format_alert( - "Cannot test for overdispersion, because pearson residuals are not implemented for models with zero-inflation or variable dispersion.", - "Only the visual inspection using `plot(check_overdispersion(model))` is possible." - ) - } - } else { - Pearson.chisq <- sum(rp^2) - prat <- Pearson.chisq / rdf - pval <- stats::pchisq(Pearson.chisq, df = rdf, lower.tail = FALSE) + return(check_overdispersion(simulate_residuals(x, ...), object_name = obj_name, ...)) } + Pearson.chisq <- sum(rp^2) + prat <- Pearson.chisq / rdf + pval <- stats::pchisq(Pearson.chisq, df = rdf, lower.tail = FALSE) + out <- list( chisq_statistic = Pearson.chisq, dispersion_ratio = prat, @@ -260,7 +281,7 @@ check_overdispersion.merMod <- function(x, verbose = TRUE, ...) { ) class(out) <- c("check_overdisp", "see_check_overdisp") - attr(out, "object_name") <- insight::safe_deparse_symbol(substitute(x)) + attr(out, "object_name") <- obj_name out } @@ -270,3 +291,41 @@ check_overdispersion.negbin <- check_overdispersion.merMod #' @export check_overdispersion.glmmTMB <- check_overdispersion.merMod + + +# simulated residuals ----------------------------- + +#' @rdname check_overdispersion +#' @export +check_overdispersion.performance_simres <- function(x, alternative = c("two.sided", "less", "greater"), ...) { + # match arguments + alternative <- match.arg(alternative) + + # check for special arguments - we may pass "object_name" from other methods + dots <- list(...) 
+  if (is.null(dots$object_name)) { +    obj_name <- insight::safe_deparse_symbol(substitute(x)) +  } else { +    obj_name <- dots$object_name +  } + +  # statistics function +  variance <- stats::sd(x$simulatedResponse)^2 +  dispersion <- function(i) stats::var(i - x$fittedPredictedResponse) / variance + +  # compute test results +  result <- .simres_statistics(x, statistic_fun = dispersion, alternative = alternative) + +  out <- list( +    dispersion_ratio = result$observed / mean(result$simulated), +    p_value = result$p +  ) + +  class(out) <- c("check_overdisp", "see_check_overdisp") +  attr(out, "object_name") <- obj_name + +  out +} + +#' @export +check_overdispersion.DHARMa <- check_overdispersion.performance_simres diff --git a/R/check_predictions.R b/R/check_predictions.R index ac4eaf45c..69222d7f8 100644 --- a/R/check_predictions.R +++ b/R/check_predictions.R @@ -37,6 +37,8 @@ #' #' @return A data frame of simulated responses and the original response vector. #' +#' @seealso [`simulate_residuals()`] and [`check_residuals()`]. +#' #' @details An example how posterior predictive checks can also be used for model #' comparison is Figure 6 from _Gabry et al. 2019, Figure 6_. #' @@ -104,7 +106,7 @@ check_predictions.default <- function(object, minfo <- insight::model_info(object, verbose = FALSE) # try to find sensible default for "type" argument -  suggest_dots <- (minfo$is_bernoulli || minfo$is_count || minfo$is_ordinal || minfo$is_categorical || minfo$is_multinomial) +  suggest_dots <- (minfo$is_bernoulli || minfo$is_count || minfo$is_ordinal || minfo$is_categorical || minfo$is_multinomial) # nolint if (missing(type) && suggest_dots) { type <- "discrete_interval" } diff --git a/R/check_residuals.R b/R/check_residuals.R new file mode 100644 index 000000000..2bb284c88 --- /dev/null +++ b/R/check_residuals.R @@ -0,0 +1,101 @@ +#' Check uniformity of simulated residuals +#' +#' `check_residuals()` checks generalized linear (mixed) models for uniformity +#' of randomized quantile residuals, which can be used to identify typical model +#' misspecification problems, such as over/underdispersion, zero-inflation, and +#' residual spatial and temporal autocorrelation. +#' +#' @param x An object returned by [`simulate_residuals()`] or +#' [`DHARMa::simulateResiduals()`]. +#' @param alternative A character string specifying the alternative hypothesis. +#' See [`stats::ks.test()`] for details. +#' @param ... Passed down to [`stats::ks.test()`]. +#' +#' @details Uniformity of residuals is checked using a Kolmogorov-Smirnov test. +#' There is a `plot()` method to visualize the distribution of the residuals. +#' The test for uniformity basically tests the extent to which the observed values +#' deviate from the model expectations (i.e. simulated values). In this sense, +#' the `check_residuals()` function has goals similar to [`check_predictions()`]. +#' +#' @inheritSection simulate_residuals Tests based on simulated residuals +#' +#' @seealso [`simulate_residuals()`] and [`check_predictions()`]. +#' +#' @return The p-value of the test statistic. +#' +#' @examplesIf require("DHARMa") +#' dat <- DHARMa::createData(sampleSize = 100, overdispersion = 0.5, family = poisson()) +#' m <- glm(observedResponse ~ Environment1, family = poisson(), data = dat) +#' res <- simulate_residuals(m) +#' check_residuals(res) +#' +#' @export +check_residuals <- function(x, ...)
{ +  UseMethod("check_residuals") +} + +#' @rdname check_residuals +#' @export +check_residuals.default <- function(x, alternative = c("two.sided", "less", "greater"), ...) { +  if (insight::is_model(x)) { +    check_residuals(simulate_residuals(x, ...), alternative = alternative) +  } else { +    insight::format_error("`check_residuals()` only works with objects supported by `simulate_residuals()` or `DHARMa::simulateResiduals()`.") # nolint +  } +} + +#' @export +check_residuals.performance_simres <- function(x, alternative = c("two.sided", "less", "greater"), ...) { +  alternative <- match.arg(alternative) +  ts_test <- suppressWarnings( +    stats::ks.test( +      stats::residuals(x), +      "punif", +      alternative = alternative, +      ... +    ) +  ) + +  p.val <- ts_test$p.value + +  attr(p.val, "data") <- x +  attr(p.val, "object_name") <- insight::safe_deparse_symbol(substitute(x)) +  class(p.val) <- unique(c("check_residuals", "see_check_residuals", class(p.val))) + +  p.val +} + +#' @export +check_residuals.DHARMa <- check_residuals.performance_simres + + +# methods ------------------------------ + +#' @export +print.check_residuals <- function(x, ...) { +  pstring <- insight::format_p(x) + +  if (x < 0.05) { +    insight::print_color( +      sprintf( +        "Warning: Non-uniformity of simulated residuals detected (%s).\n", pstring +      ), +      "red" +    ) +  } else { +    insight::print_color( +      sprintf( +        "OK: Simulated residuals appear uniformly distributed (%s).\n", pstring +      ), +      "green" +    ) +  } + +  invisible(x) +} + +#' @export +plot.check_residuals <- function(x, ...) { +  insight::check_if_installed("see", "for residual plots") +  NextMethod() +} diff --git a/R/check_zeroinflation.R b/R/check_zeroinflation.R index f0f19b369..63badc5d4 100644 --- a/R/check_zeroinflation.R +++ b/R/check_zeroinflation.R @@ -7,9 +7,13 @@ #' @param x Fitted model of class `merMod`, `glmmTMB`, `glm`, or `glm.nb` #' (package **MASS**). #' @param tolerance The tolerance for the ratio of observed and predicted -#' zeros to considered as over- or underfitting zeros. A ratio -#' between 1 +/- `tolerance` is considered as OK, while a ratio -#' beyond or below this threshold would indicate over- or underfitting. +#' zeros to be considered as over- or underfitting zeros. A ratio +#' between 1 +/- `tolerance` is considered as OK, while a ratio +#' beyond or below this threshold would indicate over- or underfitting. +#' @param alternative A character string specifying the alternative hypothesis. +#' @param ... Arguments passed down to [`simulate_residuals()`]. This only applies +#' for models with a zero-inflation component, or for models of class `glmmTMB` +#' from the `nbinom1` or `nbinom2` families. #' #' @return A list with information about the amount of predicted and observed #' zeros in the outcome, as well as the ratio between these two values. @@ -19,14 +23,52 @@ #' zero-inflation in the data. In such cases, it is recommended to use #' negative binomial or zero-inflated models. #' +#' In case of negative binomial models, models with a zero-inflation component, +#' or hurdle models, the results from `check_zeroinflation()` are based on +#' [`simulate_residuals()`], i.e. `check_zeroinflation(simulate_residuals(model))` +#' is internally called if necessary. +#' +#' @section Tests based on simulated residuals: +#' For certain models, or models from certain families, tests are based on +#' [`simulate_residuals()`]. These are usually more accurate for tests than the +#' traditionally used Pearson residuals.
However, when simulating from more +#' complex models, such as mixed models or models with zero-inflation, there are +#' several important considerations. Arguments specified in `...` are passed to +#' [`simulate_residuals()`], which relies on [`DHARMa::simulateResiduals()`] (and +#' therefore, arguments in `...` are passed further down to _DHARMa_). The +#' defaults in DHARMa are set to the most conservative option that works for +#' all models. However, in many cases, the help advises using different settings +#' in particular situations or for particular models. It is recommended to read +#' the 'Details' in `?DHARMa::simulateResiduals` closely to understand the +#' implications of the simulation process and which arguments should be modified +#' to get the most accurate results. +#' #' @family functions to check model assumptions and and assess model quality #' -#' @examplesIf require("glmmTMB") +#' @examplesIf require("glmmTMB") && require("DHARMa") #' data(Salamanders, package = "glmmTMB") #' m <- glm(count ~ spp + mined, family = poisson, data = Salamanders) #' check_zeroinflation(m) +#' +#' # for models with a zero-inflation component, it's better to carry out +#' # the check for zero-inflation using simulated residuals +#' m <- glmmTMB::glmmTMB( +#'   count ~ spp + mined, +#'   ziformula = ~ mined + spp, +#'   family = poisson, +#'   data = Salamanders +#' ) +#' res <- simulate_residuals(m) +#' check_zeroinflation(res) +#' @export +check_zeroinflation <- function(x, ...) { +  UseMethod("check_zeroinflation") +} + + +#' @rdname check_zeroinflation #' @export -check_zeroinflation <- function(x, tolerance = 0.05) { +check_zeroinflation.default <- function(x, tolerance = 0.05, ...) { # check if we have poisson model_info <- insight::model_info(x) if (!model_info$is_count) { @@ -41,28 +83,22 @@ check_zeroinflation <- function(x, tolerance = 0.05) { return(NULL) } -  # get predictions of outcome -  mu <- stats::fitted(x) +  # model classes not supported in DHARMa +  not_supported <- c("fixest", "glmx") -  # get overdispersion parameters -  if (model_info$is_negbin) { -    if (methods::is(x, "glmmTMB")) { -      theta <- stats::sigma(x) -    } else if (methods::is(x, "glmerMod")) { -      theta <- environment(x@resp$family$aic)[[".Theta"]] -    } else { -      theta <- x$theta +  # for models with a zero-inflation component or negative binomial families, +  # we use simulate_residuals() +  if (!inherits(x, not_supported) && (model_info$is_zero_inflated || model_info$is_negbin || model_info$family == "genpois")) { # nolint +    if (missing(tolerance)) { +      tolerance <- 0.1 }  -  } else { -    theta <- NULL +    return(check_zeroinflation(simulate_residuals(x, ...), tolerance = tolerance, ...)) } +  # get predictions of outcome +  mu <- stats::fitted(x) # get predicted zero-counts -  if (!is.null(theta)) { -    pred.zero <- round(sum(stats::dnbinom(x = 0, size = theta, mu = mu))) -  } else { -    pred.zero <- round(sum(stats::dpois(x = 0, lambda = mu))) -  } +  pred.zero <- round(sum(stats::dpois(x = 0, lambda = mu))) # proportion structure( @@ -77,6 +113,33 @@ check_zeroinflation <- function(x, tolerance = 0.05) { } +#' @rdname check_zeroinflation +#' @export +check_zeroinflation.performance_simres <- function(x, +                                                   tolerance = 0.1, +                                                   alternative = c("two.sided", "less", "greater"), +                                                   ...)
{ +  # match arguments +  alternative <- match.arg(alternative) + +  # compute test results +  result <- .simres_statistics(x, statistic_fun = function(i) sum(i == 0), alternative = alternative) + +  structure( +    class = "check_zi", +    list( +      predicted.zeros = round(mean(result$simulated)), +      observed.zeros = result$observed, +      ratio = mean(result$simulated) / result$observed, +      tolerance = tolerance, +      p.value = result$p +    ) +  ) +} + +#' @export +check_zeroinflation.DHARMa <- check_zeroinflation.performance_simres # methods ------------------ @@ -90,12 +153,22 @@ print.check_zi <- function(x, ...) { lower <- 1 - x$tolerance upper <- 1 + x$tolerance +  if (is.null(x$p.value)) { +    p_string <- "" +  } else { +    p_string <- paste0(" (", insight::format_p(x$p.value), ")") +  } + if (x$ratio < lower) { -    message("Model is underfitting zeros (probable zero-inflation).") +    message("Model is underfitting zeros (probable zero-inflation)", p_string, ".") } else if (x$ratio > upper) { -    message("Model is overfitting zeros.") +    message("Model is overfitting zeros", p_string, ".") } else { -    insight::format_alert("Model seems ok, ratio of observed and predicted zeros is within the tolerance range.") +    insight::format_alert(paste0( +      "Model seems ok, ratio of observed and predicted zeros is within the tolerance range", +      p_string, +      "." +    )) } invisible(x) diff --git a/R/simulate_residuals.R b/R/simulate_residuals.R new file mode 100644 index 000000000..207b660db --- /dev/null +++ b/R/simulate_residuals.R @@ -0,0 +1,117 @@ +#' @title Simulate randomized quantile residuals from a model +#' @name simulate_residuals +#' +#' @description Returns simulated residuals from a model. This is useful for +#' checking the uniformity of residuals, in particular for non-Gaussian models, +#' where the residuals are not expected to be normally distributed. +#' +#' @param x A model object. +#' @param iterations Number of simulations to run. +#' @param ... Arguments passed on to [`DHARMa::simulateResiduals()`]. +#' +#' @return Simulated residuals, which can be further processed with +#' [`check_residuals()`]. The returned object is of class `DHARMa` and +#' `performance_simres`. +#' +#' @seealso [`check_residuals()`] and [`check_predictions()`]. +#' +#' @details This function is a small wrapper around [`DHARMa::simulateResiduals()`]. +#' It basically only sets `plot = FALSE` and adds an additional class attribute +#' (`"performance_simres"`), which allows using the DHARMa object with the plotting +#' functions of the **see** package. See also `vignette("DHARMa")`. There is a +#' `plot()` method to visualize the distribution of the residuals. +#' +#' @section Tests based on simulated residuals: +#' For certain models, or models from certain families, tests like +#' [`check_zeroinflation()`] or [`check_overdispersion()`] are based on +#' `simulate_residuals()`. These are usually more accurate for such tests than +#' the traditionally used Pearson residuals. However, when simulating from more +#' complex models, such as mixed models or models with zero-inflation, there are +#' several important considerations. `simulate_residuals()` relies on +#' [`DHARMa::simulateResiduals()`], and additional arguments specified in `...` +#' are passed further down to that function. The defaults in DHARMa are set to +#' the most conservative option that works for all models. However, in many +#' cases, the help advises using different settings in particular situations +#' or for particular models.
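As a conceptual illustration of how the simulation-based zero-inflation check defined above works (a sketch that mirrors `check_zeroinflation.performance_simres()` and the `.simres_statistics()` helper from this patch, not a literal excerpt):

```r
simres <- simulate_residuals(model)  # `model` is any supported count model (illustrative)

observed_zeros  <- sum(simres$observedResponse == 0)
simulated_zeros <- apply(simres$simulatedResponse, 2, function(i) sum(i == 0))

# ratio of predicted (mean simulated) to observed zeros, as reported by print()
ratio <- mean(simulated_zeros) / observed_zeros
# two-sided p-value: how extreme is the observed zero count among the simulations?
p <- min(2 * min(mean(simulated_zeros <= observed_zeros),
                 mean(simulated_zeros >= observed_zeros)), 1)
```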
It is recommended to read the 'Details' in +#' `?DHARMa::simulateResiduals` closely to understand the implications of the +#' simulation process and which arguments should be modified to get the most +#' accurate results. +#' +#' @references +#' +#' - Hartig, F., & Lohse, L. (2022). DHARMa: Residual Diagnostics for Hierarchical +#' (Multi-Level / Mixed) Regression Models (Version 0.4.5). Retrieved from +#' https://CRAN.R-project.org/package=DHARMa +#' +#' - Dunn, P. K., & Smyth, G. K. (1996). Randomized Quantile Residuals. Journal +#' of Computational and Graphical Statistics, 5(3), 236. \doi{10.2307/1390802} +#' +#' @examplesIf require("DHARMa") +#' m <- lm(mpg ~ wt + cyl + gear + disp, data = mtcars) +#' simulate_residuals(m) +#' +#' @export +simulate_residuals <- function(x, iterations = 250, ...) { +  insight::check_if_installed("DHARMa") +  # TODO (low priority): Note that DHARMa::simulateResiduals(x, ...) does its own checks for whether +  # or not the model passed to it is supported, do we want to use this or do our +  # own checks so we can supply our own error message? +  if (iterations < 2) { +    insight::format_error("`iterations` must be at least 2.") +  } +  # It's important to preserve this object as is, rather than prematurely +  # extracting the residuals from it because the object contains important stuff +  # in it that we'll want to pass onto other functions later, such as passing +  # the fitted model into check_model(). +  out <- DHARMa::simulateResiduals(x, n = iterations, plot = FALSE, ...) +  class(out) <- c("performance_simres", "see_performance_simres", class(out)) +  out +} + + +# methods ------------------------------ + +#' @export +print.performance_simres <- function(x, ...) { +  # TODO (low priority): We can probably just base this off of the print method +  # DHARMa uses, but with an easystats style. For now we can just stick with +  # DHARMa's method. +  msg <- paste0( +    "Simulated residuals from a model of class `", class(x$fittedModel)[1], +    "` based on ", x$nSim, " simulations. Use `check_residuals()` to check ", +    "uniformity of residuals. It is recommended to refer to `?DHARMa::simulateResiduals`", +    " and `vignette(\"DHARMa\")` for more information about different settings", +    " in particular situations or for particular models.\n" +  ) +  cat(insight::format_message(msg)) +} + +#' @export +plot.performance_simres <- function(x, ...)
{ + insight::check_if_installed("see", "for residual plots") + NextMethod() +} + + +# helper functions --------------------- + +.simres_statistics <- function(x, statistic_fun, alternative = "two.sided") { + # summarize the observed and simulated residuals + if (is.null(statistic_fun)) { + # we pass the values to compute the p-value directly (for "check_outliers()") + observed <- x + simulated <- statistic_fun + } else { + # or apply a function to observed and simulated residusls, + # to calcualte a summary statistic + observed <- statistic_fun(x$observedResponse) + simulated <- apply(x$simulatedResponse, 2, statistic_fun) + } + # p is simply ratio of simulated zeros to observed zeros + p <- switch(alternative, + greater = mean(simulated >= observed), + less = mean(simulated <= observed), + min(min(mean(simulated <= observed), mean(simulated >= observed)) * 2, 1) + ) + list(observed = observed, simulated = simulated, p = p) +} diff --git a/_pkgdown.yaml b/_pkgdown.yaml index 71f444657..d6e56740c 100644 --- a/_pkgdown.yaml +++ b/_pkgdown.yaml @@ -13,6 +13,7 @@ reference: contents: - binned_residuals - starts_with("check_") + - simulate_residuals - title: "Check Model Performance or Quality" contents: diff --git a/inst/WORDLIST b/inst/WORDLIST index 377bd8ea4..be923aed4 100644 --- a/inst/WORDLIST +++ b/inst/WORDLIST @@ -46,6 +46,7 @@ DOI Datenerhebung Delacre Deskriptivstatistische +DHARMa Distinguishability Dom Dominicy @@ -76,6 +77,7 @@ Gazen Gelman Gnanadesikan Guilford +Hartig HDI HJ Hastie @@ -124,6 +126,7 @@ Ley Leys Lillo Liu +Lohse Lomax MADs MSA @@ -182,6 +185,7 @@ Sensivity Shachar Shinichi Skrondal +Smyth Solomons Somers Specifity @@ -279,6 +283,7 @@ metafor mfx mhurdle mis +misspecification mlm mlogit modelfit @@ -302,6 +307,7 @@ quartile quartiles rOpenSci recoding +reimplement rempsyc reproducibility rescaling @@ -325,6 +331,7 @@ unadjusted und underfitted underfitting +underdispersion visualisation winsorization winsorize diff --git a/man/check_model.Rd b/man/check_model.Rd index 4e8b5fddf..4bdea6ebe 100644 --- a/man/check_model.Rd +++ b/man/check_model.Rd @@ -21,6 +21,7 @@ check_model(x, ...) show_dots = NULL, bandwidth = "nrd", type = "density", + residual_type = NULL, verbose = FALSE, ... ) @@ -57,7 +58,9 @@ for dots, and third color for outliers or extreme values.} format \code{"package::theme_name"} (e.g. \code{"ggplot2::theme_minimal"}).} \item{detrend}{Logical. Should Q-Q/P-P plots be detrended? Defaults to -\code{TRUE}.} +\code{TRUE} for linear models or when \code{residual_type = "normal"}. Defaults to +\code{FALSE} for QQ plots based on simulated residuals (i.e. when +\code{residual_type = "simulated"}).} \item{show_dots}{Logical, if \code{TRUE}, will show data points in the plot. Set to \code{FALSE} for models with many observations, if generating the plot is too @@ -76,6 +79,15 @@ to a different value.} options are appropriate for models with discrete - binary, integer or ordinal etc. - outcomes).} +\item{residual_type}{Character, indicating the type of residuals to be used. +For non-Gaussian models, the default is \code{"simulated"}, which uses simulated +residuals. These are based on \code{\link[=simulate_residuals]{simulate_residuals()}} and thus uses the +\strong{DHARMa} package to return randomized quantile residuals. For Gaussian +models, the default is \code{"normal"}, which uses the default residuals from +the model. 
Setting \code{residual_type = "normal"} for non-Gaussian models will
+use a half-normal Q-Q plot of the absolute value of the standardized deviance
+residuals.}
+
 \item{verbose}{If \code{FALSE} (default), suppress most warning messages.}
 }
 \value{
@@ -161,10 +173,22 @@
 This plot is used to determine if the residuals of the regression model are
 normally distributed. Usually, dots should fall along the line. If there is
 some deviation (mostly at the tails), this indicates that the model doesn't
 predict the outcome well for that range that shows larger deviations from
-the line. For generalized linear models, a half-normal Q-Q plot of the
-absolute value of the standardized deviance residuals is shown, however, the
-interpretation of the plot remains the same. See \code{\link[=check_normality]{check_normality()}} for
-further details.
+the line. For generalized linear models and when \code{residual_type = "normal"},
+a half-normal Q-Q plot of the absolute value of the standardized deviance
+residuals is shown; however, the interpretation of the plot remains the same.
+See \code{\link[=check_normality]{check_normality()}} for further details. Usually, for generalized linear
+(mixed) models, a test for uniformity of residuals based on simulated residuals
+is conducted (see next section).
+}
+
+\section{Uniformity of Residuals}{
+
+For non-Gaussian models, when \code{residual_type = "simulated"} (the default
+for generalized linear (mixed) models), residuals are not expected to be
+normally distributed. In this case, the created Q-Q plot checks the uniformity
+of residuals. The interpretation of the plot is the same as for the normal
+Q-Q plot. See \code{\link[=simulate_residuals]{simulate_residuals()}} and \code{\link[=check_residuals]{check_residuals()}} for further
+details.
 }
 
 \section{Overdispersion}{
@@ -188,12 +212,12 @@ inside the error bounds. See \code{\link[=binned_residuals]{binned_residuals()}}
 
 \section{Residuals for (Generalized) Linear Models}{
 
-Plots that check the normality of residuals (QQ-plot) or the homogeneity of
+Plots that check the normality of residuals (Q-Q plot) or the homogeneity of
 variance use standardized Pearson's residuals for generalized linear models,
 and standardized residuals for linear models. The plots for the normality of
 residuals (with overlayed normal curve) and for the linearity assumption use
-the default residuals for \code{lm} and \code{glm} (which are deviance
-residuals for \code{glm}).
+the default residuals for \code{lm} and \code{glm} (which are deviance residuals for
+\code{glm}).
 }
 
 \section{Troubleshooting}{
diff --git a/man/check_outliers.Rd b/man/check_outliers.Rd
index 22b88228c..84a381985 100644
--- a/man/check_outliers.Rd
+++ b/man/check_outliers.Rd
@@ -5,6 +5,7 @@
 \alias{check_outliers.default}
 \alias{check_outliers.numeric}
 \alias{check_outliers.data.frame}
+\alias{check_outliers.performance_simres}
 \title{Outliers detection (check for influential observations)}
 \usage{
 check_outliers(x, ...)
@@ -21,14 +22,24 @@ check_outliers(x, ...)
 \method{check_outliers}{numeric}(x, method = "zscore_robust", threshold = NULL, ...)
 
 \method{check_outliers}{data.frame}(x, method = "mahalanobis", threshold = NULL, ID = NULL, ...)
+
+\method{check_outliers}{performance_simres}(
+  x,
+  type = "default",
+  iterations = 100,
+  alternative = "two.sided",
+  ...
+)
 }
 \arguments{
-\item{x}{A model or a data.frame object.}
+\item{x}{A model, a data frame, an object returned by \code{\link[=simulate_residuals]{simulate_residuals()}}
+(of class \code{performance_simres}), or a \code{DHARMa} object.}
 
 \item{...}{When \code{method = "ics"}, further arguments in \code{...} are passed
 down to \code{\link[ICSOutlier:ics.outlier]{ICSOutlier::ics.outlier()}}. When \code{method = "mahalanobis"}, they are
 passed down to \code{\link[stats:mahalanobis]{stats::mahalanobis()}}. \code{percentage_central} can
-be specified when \code{method = "mcd"}.}
+be specified when \code{method = "mcd"}. For objects of class \code{performance_simres}
+or \code{DHARMa}, further arguments are passed down to \code{DHARMa::testOutliers()}.}
 
 \item{method}{The outlier detection method(s). Can be \code{"all"} or some of
 \code{"cook"}, \code{"pareto"}, \code{"zscore"}, \code{"zscore_robust"}, \code{"iqr"}, \code{"ci"}, \code{"eti"},
@@ -44,6 +55,15 @@ for any of the method run.}
 
 \item{ID}{Optional, to report an ID column along with the row number.}
 
 \item{verbose}{Toggle warnings.}
+
+\item{type}{Type of method to test for outliers. Can be one of \code{"default"},
+\code{"binomial"} or \code{"bootstrap"}. Only applies when \code{x} is an object returned
+by \code{simulate_residuals()} or of class \code{DHARMa}. See 'Details' in
+\code{?DHARMa::testOutliers} for a detailed description of the types.}
+
+\item{iterations}{Number of simulations to run.}
+
+\item{alternative}{A character string specifying the alternative hypothesis.}
 }
 \value{
 A logical vector of the detected outliers with a nice printing
@@ -230,6 +250,19 @@ LOF distance. Requires the \strong{dbscan} package.
 }
 }
 
+\section{Methods for simulated residuals}{
+
+
+The approach for detecting outliers based on simulated residuals differs
+from the traditional methods and may not detect outliers as expected. In
+essence, this approach compares observed to simulated values. However, we do
+not know the deviation of the observed data from the model expectation, and
+thus, the term "outlier" should be taken with a grain of salt. It refers to
+"simulation outliers". Basically, the comparison tests whether an observed
+data point is outside the simulated range. It is strongly recommended to read
+the related documentation in the \strong{DHARMa} package, e.g. \code{?DHARMa::testOutliers}.
+}
+
 \section{Threshold specification}{
 
 
diff --git a/man/check_overdispersion.Rd b/man/check_overdispersion.Rd
index ce8341dc4..19c957323 100644
--- a/man/check_overdispersion.Rd
+++ b/man/check_overdispersion.Rd
@@ -2,15 +2,22 @@
 % Please edit documentation in R/check_overdispersion.R
 \name{check_overdispersion}
 \alias{check_overdispersion}
-\title{Check overdispersion of GL(M)M's}
+\alias{check_overdispersion.performance_simres}
+\title{Check overdispersion (and underdispersion) of GL(M)M's}
 \usage{
 check_overdispersion(x, ...)
+
+\method{check_overdispersion}{performance_simres}(x, alternative = c("two.sided", "less", "greater"), ...)
 }
 \arguments{
 \item{x}{Fitted model of class \code{merMod}, \code{glmmTMB}, \code{glm}, or \code{glm.nb}
-(package \strong{MASS}).}
+(package \strong{MASS}), or an object returned by \code{simulate_residuals()}.}
+
+\item{...}{Arguments passed down to \code{\link[=simulate_residuals]{simulate_residuals()}}. This only applies
+to models with a zero-inflation component, or to models of class \code{glmmTMB}
+from the \code{nbinom1} or \code{nbinom2} family.}
 
-\item{...}{Currently not used.}
+\item{alternative}{A character string specifying the alternative hypothesis.}
 }
 \value{
 A list with results from the overdispersion test, like chi-squared
 statistics, p-value or dispersion ratio.
 }
 \description{
 \code{check_overdispersion()} checks generalized linear (mixed)
-models for overdispersion.
+models for overdispersion (and underdispersion).
 }
 \details{
 Overdispersion occurs when the observed variance is higher than the
 variance of a theoretical model. For Poisson models, variance increases
 with the mean and, therefore, variance usually (roughly) equals the mean
-value. If the variance is much higher, the data are "overdispersed".
+value. If the variance is much higher, the data are "overdispersed". A less
+common case is underdispersion, where the variance is much lower than the
+mean.
 }
 \section{Interpretation of the Dispersion Ratio}{
 
 If the dispersion ratio is close to one, a Poisson model fits well to the
 data. Dispersion ratios larger than one indicate overdispersion, thus a
-negative binomial model or similar might fit better to the data. A p-value <
-.05 indicates overdispersion.
+negative binomial model or similar might fit better to the data. Dispersion
+ratios much smaller than one indicate underdispersion. A p-value < .05
+indicates either overdispersion or underdispersion (the former being more common).
 }
 
 \section{Overdispersion in Poisson Models}{
@@ -40,6 +50,12 @@
 For Poisson models, the overdispersion test is based on the code from
 \emph{Gelman and Hill (2007), page 115}.
 }
 
+\section{Overdispersion in Negative Binomial or Zero-Inflated Models}{
+
+For negative binomial (mixed) models or models with a zero-inflation component,
+the overdispersion test is based on simulated residuals (see \code{\link[=simulate_residuals]{simulate_residuals()}}).
+}
+
 \section{Overdispersion in Mixed Models}{
 
 For \code{merMod}- and \code{glmmTMB}-objects, \code{check_overdispersion()}
@@ -47,8 +63,10 @@ is based on the code in the
 \href{http://bbolker.github.io/mixedmodels-misc/glmmFAQ.html}{GLMM FAQ},
 section \emph{How can I deal with overdispersion in GLMMs?}. Note that this
 function only returns an \emph{approximate} estimate of an overdispersion
-parameter, and is probably inaccurate for zero-inflated mixed models (fitted
-with \code{glmmTMB}).
+parameter. This approach would be inaccurate for zero-inflated or negative
+binomial mixed models (fitted with \code{glmmTMB}); in such cases, the
+overdispersion test is based on \code{\link[=simulate_residuals]{simulate_residuals()}} (i.e. it is identical
+to \code{check_overdispersion(simulate_residuals(model))}).
 }
 
 \section{How to fix Overdispersion}{
@@ -58,6 +76,23 @@ by choosing a different distributional family (like Quasi-Poisson, or
 negative binomial, see \emph{Gelman and Hill (2007), pages 115-116}).
 }
 
+\section{Tests based on simulated residuals}{
+
+For certain models, or models from certain families, tests are based on
+\code{\link[=simulate_residuals]{simulate_residuals()}}. These are usually more accurate than tests based on
+the traditionally used Pearson residuals. However, when simulating from more
+complex models, such as mixed models or models with zero-inflation, there are
+several important considerations. 
Arguments specified in \code{...} are passed to +\code{\link[=simulate_residuals]{simulate_residuals()}}, which relies on \code{\link[DHARMa:simulateResiduals]{DHARMa::simulateResiduals()}} (and +therefore, arguments in \code{...} are passed further down to \emph{DHARMa}). The +defaults in DHARMa are set on the most conservative option that works for +all models. However, in many cases, the help advises to use different settings +in particular situations or for particular models. It is recommended to read +the 'Details' in \code{?DHARMa::simulateResiduals} closely to understand the +implications of the simulation process and which arguments should be modified +to get the most accurate results. +} + \examples{ \dontshow{if (getRversion() >= "4.0.0" && require("glmmTMB", quietly = TRUE)) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} diff --git a/man/check_predictions.Rd b/man/check_predictions.Rd index 591c813da..e64b306fb 100644 --- a/man/check_predictions.Rd +++ b/man/check_predictions.Rd @@ -117,6 +117,8 @@ Cambridge University Press. } } \seealso{ +\code{\link[=simulate_residuals]{simulate_residuals()}} and \code{\link[=check_residuals]{check_residuals()}}. + Other functions to check model assumptions and and assess model quality: \code{\link{check_autocorrelation}()}, \code{\link{check_collinearity}()}, diff --git a/man/check_residuals.Rd b/man/check_residuals.Rd new file mode 100644 index 000000000..dfb56ff83 --- /dev/null +++ b/man/check_residuals.Rd @@ -0,0 +1,65 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/check_residuals.R +\name{check_residuals} +\alias{check_residuals} +\alias{check_residuals.default} +\title{Check uniformity of simulated residuals} +\usage{ +check_residuals(x, ...) + +\method{check_residuals}{default}(x, alternative = c("two.sided", "less", "greater"), ...) +} +\arguments{ +\item{x}{An object returned by \code{\link[=simulate_residuals]{simulate_residuals()}} or +\code{\link[DHARMa:simulateResiduals]{DHARMa::simulateResiduals()}}.} + +\item{...}{Passed down to \code{\link[stats:ks.test]{stats::ks.test()}}.} + +\item{alternative}{A character string specifying the alternative hypothesis. +See \code{\link[stats:ks.test]{stats::ks.test()}} for details.} +} +\value{ +The p-value of the test statistics. +} +\description{ +\code{check_residuals()} checks generalized linear (mixed) models for uniformity +of randomized quantile residuals, which can be used to identify typical model +misspecification problems, such as over/underdispersion, zero-inflation, and +residual spatial and temporal autocorrelation. +} +\details{ +Uniformity of residuals is checked using a Kolmogorov-Smirnov test. +There is a \code{plot()} method to visualize the distribution of the residuals. +The test for uniformity basically tests to which extent the observed values +deviate from the model expectations (i.e. simulated values). In this sense, +the \code{check_residuals()} function has similar goals like \code{\link[=check_predictions]{check_predictions()}}. +} +\section{Tests based on simulated residuals}{ + +For certain models, resp. model from certain families, tests like +\code{\link[=check_zeroinflation]{check_zeroinflation()}} or \code{\link[=check_overdispersion]{check_overdispersion()}} are based on +\code{simulated_residuals()}. These are usually more accurate for such tests than +the traditionally used Pearson residuals. 
However, when simulating from more +complex model, such as mixed models or models with zero-inflation, there are +several important considerations. \code{simulate_residuals()} relies on +\code{\link[DHARMa:simulateResiduals]{DHARMa::simulateResiduals()}}, and additional arguments specified in \code{...} +are passed further down to that function. The defaults in DHARMa are set on +the most conservative option that works for all models. However, in many +cases, the help advises to use different settings in particular situations +or for particular models. It is recommended to read the 'Details' in +\code{?DHARMa::simulateResiduals} closely to understand the implications of the +simulation process and which arguments should be modified to get the most +accurate results. +} + +\examples{ +\dontshow{if (require("DHARMa")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +dat <- DHARMa::createData(sampleSize = 100, overdispersion = 0.5, family = poisson()) +m <- glm(observedResponse ~ Environment1, family = poisson(), data = dat) +res <- simulate_residuals(m) +check_residuals(res) +\dontshow{\}) # examplesIf} +} +\seealso{ +\code{\link[=simulate_residuals]{simulate_residuals()}} and \code{\link[=check_predictions]{check_predictions()}}. +} diff --git a/man/check_zeroinflation.Rd b/man/check_zeroinflation.Rd index db9eddd23..9de6c1f5c 100644 --- a/man/check_zeroinflation.Rd +++ b/man/check_zeroinflation.Rd @@ -2,18 +2,35 @@ % Please edit documentation in R/check_zeroinflation.R \name{check_zeroinflation} \alias{check_zeroinflation} +\alias{check_zeroinflation.default} +\alias{check_zeroinflation.performance_simres} \title{Check for zero-inflation in count models} \usage{ -check_zeroinflation(x, tolerance = 0.05) +check_zeroinflation(x, ...) + +\method{check_zeroinflation}{default}(x, tolerance = 0.05, ...) + +\method{check_zeroinflation}{performance_simres}( + x, + tolerance = 0.1, + alternative = c("two.sided", "less", "greater"), + ... +) } \arguments{ \item{x}{Fitted model of class \code{merMod}, \code{glmmTMB}, \code{glm}, or \code{glm.nb} (package \strong{MASS}).} +\item{...}{Arguments passed down to \code{\link[=simulate_residuals]{simulate_residuals()}}. This only applies +for models with zero-inflation component, or for models of class \code{glmmTMB} +from \code{nbinom1} or \code{nbinom2} family.} + \item{tolerance}{The tolerance for the ratio of observed and predicted zeros to considered as over- or underfitting zeros. A ratio between 1 +/- \code{tolerance} is considered as OK, while a ratio beyond or below this threshold would indicate over- or underfitting.} + +\item{alternative}{A character string specifying the alternative hypothesis.} } \value{ A list with information about the amount of predicted and observed @@ -28,12 +45,45 @@ If the amount of observed zeros is larger than the amount of predicted zeros, the model is underfitting zeros, which indicates a zero-inflation in the data. In such cases, it is recommended to use negative binomial or zero-inflated models. + +In case of negative binomial models, models with zero-inflation component, +or hurdle models, the results from \code{check_zeroinflation()} are based on +\code{\link[=simulate_residuals]{simulate_residuals()}}, i.e. \code{check_zeroinflation(simulate_residuals(model))} +is internally called if necessary. } +\section{Tests based on simulated residuals}{ + +For certain models, resp. model from certain families, tests are based on +\code{\link[=simulated_residuals]{simulated_residuals()}}. 
These are usually more accurate for tests than the +traditionally used Pearson residuals. However, when simulating from more +complex model, such as mixed models or models with zero-inflation, there are +several important considerations. Arguments specified in \code{...} are passed to +\code{\link[=simulate_residuals]{simulate_residuals()}}, which relies on \code{\link[DHARMa:simulateResiduals]{DHARMa::simulateResiduals()}} (and +therefore, arguments in \code{...} are passed further down to \emph{DHARMa}). The +defaults in DHARMa are set on the most conservative option that works for +all models. However, in many cases, the help advises to use different settings +in particular situations or for particular models. It is recommended to read +the 'Details' in \code{?DHARMa::simulateResiduals} closely to understand the +implications of the simulation process and which arguments should be modified +to get the most accurate results. +} + \examples{ -\dontshow{if (require("glmmTMB")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +\dontshow{if (require("glmmTMB") && require("DHARMa")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} data(Salamanders, package = "glmmTMB") m <- glm(count ~ spp + mined, family = poisson, data = Salamanders) check_zeroinflation(m) + +# for models with zero-inflation component, it's better to carry out +# the check for zero-inflation using simulated residuals +m <- glmmTMB::glmmTMB( + count ~ spp + mined, + ziformula = ~ mined + spp, + family = poisson, + data = Salamanders +) +res <- simulate_residuals(m) +check_zeroinflation(res) \dontshow{\}) # examplesIf} } \seealso{ diff --git a/man/simulate_residuals.Rd b/man/simulate_residuals.Rd new file mode 100644 index 000000000..030e69501 --- /dev/null +++ b/man/simulate_residuals.Rd @@ -0,0 +1,68 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/simulate_residuals.R +\name{simulate_residuals} +\alias{simulate_residuals} +\title{Simulate randomized quantile residuals from a model} +\usage{ +simulate_residuals(x, iterations = 250, ...) +} +\arguments{ +\item{x}{A model object.} + +\item{iterations}{Number of simulations to run.} + +\item{...}{Arguments passed on to \code{\link[DHARMa:simulateResiduals]{DHARMa::simulateResiduals()}}.} +} +\value{ +Simulated residuals, which can be further processed with +\code{\link[=check_residuals]{check_residuals()}}. The returned object is of class \code{DHARMa} and +\code{performance_simres}. +} +\description{ +Returns simulated residuals from a model. This is useful for +checking the uniformity of residuals, in particular for non-Gaussian models, +where the residuals are not expected to be normally distributed. +} +\details{ +This function is a small wrapper around \code{\link[DHARMa:simulateResiduals]{DHARMa::simulateResiduals()}}. +It basically only sets \code{plot = FALSE} and adds an additional class attribute +(\code{"performance_sim_res"}), which allows using the DHARMa object in own plotting +functions in the \strong{see} package. See also \code{vignette("DHARMa")}. There is a +\code{plot()} method to visualize the distribution of the residuals. +} +\section{Tests based on simulated residuals}{ + +For certain models, resp. model from certain families, tests like +\code{\link[=check_zeroinflation]{check_zeroinflation()}} or \code{\link[=check_overdispersion]{check_overdispersion()}} are based on +\code{simulated_residuals()}. 
These are usually more accurate for such tests than +the traditionally used Pearson residuals. However, when simulating from more +complex model, such as mixed models or models with zero-inflation, there are +several important considerations. \code{simulate_residuals()} relies on +\code{\link[DHARMa:simulateResiduals]{DHARMa::simulateResiduals()}}, and additional arguments specified in \code{...} +are passed further down to that function. The defaults in DHARMa are set on +the most conservative option that works for all models. However, in many +cases, the help advises to use different settings in particular situations +or for particular models. It is recommended to read the 'Details' in +\code{?DHARMa::simulateResiduals} closely to understand the implications of the +simulation process and which arguments should be modified to get the most +accurate results. +} + +\examples{ +\dontshow{if (require("DHARMa")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +m <- lm(mpg ~ wt + cyl + gear + disp, data = mtcars) +simulate_residuals(m) +\dontshow{\}) # examplesIf} +} +\references{ +\itemize{ +\item Hartig, F., & Lohse, L. (2022). DHARMa: Residual Diagnostics for Hierarchical +(Multi-Level / Mixed) Regression Models (Version 0.4.5). Retrieved from +https://CRAN.R-project.org/package=DHARMa +\item Dunn, P. K., & Smyth, G. K. (1996). Randomized Quantile Residuals. Journal +of Computational and Graphical Statistics, 5(3), 236. \doi{10.2307/1390802} +} +} +\seealso{ +\code{\link[=check_residuals]{check_residuals()}} and \code{\link[=check_predictions]{check_predictions()}}. +} diff --git a/tests/testthat/_snaps/check_collinearity.md b/tests/testthat/_snaps/check_collinearity.md index 3e9aa24b7..dec439e85 100644 --- a/tests/testthat/_snaps/check_collinearity.md +++ b/tests/testthat/_snaps/check_collinearity.md @@ -12,3 +12,26 @@ P 1.00 1.00 1.00 K 1.00 1.00 1.00 +# check_collinearity, hurdle/zi models w/o zi-formula + + Code + print(out) + Output + # Check for Multicollinearity + + * conditional component: + + Low Correlation + + Term VIF VIF 95% CI Increased SE Tolerance Tolerance 95% CI + fem 1.06 [1.02, 1.20] 1.03 0.95 [0.83, 0.98] + mar 1.06 [1.02, 1.20] 1.03 0.95 [0.83, 0.98] + + * zero inflated component: + + Low Correlation + + Term VIF VIF 95% CI Increased SE Tolerance Tolerance 95% CI + fem 1.07 [1.02, 1.20] 1.03 0.94 [0.83, 0.98] + mar 1.07 [1.02, 1.20] 1.03 0.94 [0.83, 0.98] + diff --git a/tests/testthat/test-binned_residuals.R b/tests/testthat/test-binned_residuals.R index 7b966797e..e23e05d01 100644 --- a/tests/testthat/test-binned_residuals.R +++ b/tests/testthat/test-binned_residuals.R @@ -21,6 +21,10 @@ test_that("binned_residuals", { c(-0.05686, -0.12331, -0.35077, -0.57683, 0.17916, -0.44147), tolerance = 1e-4 ) + expect_identical( + capture.output(print(result)), + "Warning: Probably bad model fit. Only about 50% of the residuals are inside the error bounds." + ) }) diff --git a/tests/testthat/test-check_autocorrelation.R b/tests/testthat/test-check_autocorrelation.R index f70617565..a97b2eeaa 100644 --- a/tests/testthat/test-check_autocorrelation.R +++ b/tests/testthat/test-check_autocorrelation.R @@ -4,4 +4,9 @@ test_that("check_autocorrelation", { set.seed(123) out <- check_autocorrelation(m) expect_equal(as.vector(out), 0.316, ignore_attr = TRUE, tolerance = 1e-2) + expect_identical( + capture.output(print(out)), + "OK: Residuals appear to be independent and not autocorrelated (p = 0.316)." 
+ ) + expect_warning(plot(out), "There is currently") }) diff --git a/tests/testthat/test-check_collinearity.R b/tests/testthat/test-check_collinearity.R index ea41af513..4811c5afe 100644 --- a/tests/testthat/test-check_collinearity.R +++ b/tests/testthat/test-check_collinearity.R @@ -202,14 +202,15 @@ test_that("check_collinearity, hurdle/zi models w/o zi-formula", { link = "logit" ) out <- check_collinearity(m) - expect_identical( - colnames(out), + expect_named( + out, c( "Term", "VIF", "VIF_CI_low", "VIF_CI_high", "SE_factor", "Tolerance", "Tolerance_CI_low", "Tolerance_CI_high", "Component" ) ) expect_equal(out$VIF, c(1.05772, 1.05772, 1.06587, 1.06587), tolerance = 1e-4) + expect_snapshot(print(out)) }) test_that("check_collinearity, invalid data", { diff --git a/tests/testthat/test-check_convergence.R b/tests/testthat/test-check_convergence.R index 1663d1219..8897b785d 100644 --- a/tests/testthat/test-check_convergence.R +++ b/tests/testthat/test-check_convergence.R @@ -26,3 +26,20 @@ test_that("check_convergence", { model <- lme4::lmer(Reaction ~ Days + (1 + Days | Subject), data = sleepstudy) expect_true(check_convergence(model)) }) + + +test_that("check_convergence, glmmTMB", { + skip_if_not_installed("glmmTMB") + data(iris) + model <- suppressWarnings(glmmTMB::glmmTMB( + Sepal.Length ~ poly(Petal.Width, 4) * poly(Petal.Length, 4) + + (1 + poly(Petal.Width, 4) | Species), + data = iris + )) + expect_false(check_convergence(model)) + model <- suppressWarnings(glmmTMB::glmmTMB( + Sepal.Length ~ Petal.Width + (1 | Species), + data = iris + )) + expect_true(check_convergence(model)) +}) diff --git a/tests/testthat/test-check_heterogeneity_bias.R b/tests/testthat/test-check_heterogeneity_bias.R index 7abc6af30..2bd63856e 100644 --- a/tests/testthat/test-check_heterogeneity_bias.R +++ b/tests/testthat/test-check_heterogeneity_bias.R @@ -1,7 +1,7 @@ test_that("check_heterogeneity_bias", { data(iris) set.seed(123) - iris$ID <- sample(1:4, nrow(iris), replace = TRUE) # fake-ID + iris$ID <- sample.int(4, nrow(iris), replace = TRUE) # fake-ID out <- check_heterogeneity_bias(iris, select = c("Sepal.Length", "Petal.Length"), group = "ID") expect_equal(out, c("Sepal.Length", "Petal.Length"), ignore_attr = TRUE) expect_output(print(out), "Possible heterogeneity bias due to following predictors: Sepal\\.Length, Petal\\.Length") diff --git a/tests/testthat/test-check_heteroskedasticity.R b/tests/testthat/test-check_heteroskedasticity.R new file mode 100644 index 000000000..4d64a870b --- /dev/null +++ b/tests/testthat/test-check_heteroskedasticity.R @@ -0,0 +1,17 @@ +test_that("check_heteroskedasticity", { + data(mtcars) + m <- lm(mpg ~ wt + cyl + gear + disp, data = mtcars) + out <- check_heteroscedasticity(m) + expect_equal(as.vector(out), 0.0423, ignore_attr = TRUE, tolerance = 1e-2) + expect_identical( + capture.output(print(out)), + "Warning: Heteroscedasticity (non-constant error variance) detected (p = 0.042)." + ) + m <- lm(mpg ~ hp, data = mtcars) + out <- check_heteroscedasticity(m) + expect_equal(as.vector(out), 0.8271352, ignore_attr = TRUE, tolerance = 1e-2) + expect_identical( + capture.output(print(out)), + "OK: Error variance appears to be homoscedastic (p = 0.827)." 
+ ) +}) diff --git a/tests/testthat/test-check_model.R b/tests/testthat/test-check_model.R index 06973756f..b008658dc 100644 --- a/tests/testthat/test-check_model.R +++ b/tests/testthat/test-check_model.R @@ -64,11 +64,8 @@ test_that("`check_model()` warnings for tweedie", { )) expect_message( expect_message( - expect_message( - check_model(m, iterations = 1, verbose = TRUE), - regex = "Not enough model terms" - ), - regex = "QQ plot could not" + check_model(m, iterations = 2, verbose = TRUE), + regex = "Not enough model terms" ) ) }) diff --git a/tests/testthat/test-check_outliers.R b/tests/testthat/test-check_outliers.R index 8840e26bf..016c827af 100644 --- a/tests/testthat/test-check_outliers.R +++ b/tests/testthat/test-check_outliers.R @@ -342,3 +342,40 @@ test_that("check_outliers with invald data", { regex = "No numeric variables found" ) }) + + +test_that("check_outliers with DHARMa", { + skip_if_not_installed("DHARMa") + mt1 <- mtcars[, c(1, 3, 4)] + # create some fake outliers and attach outliers to main df + mt2 <- rbind(mt1, data.frame( + mpg = c(37, 40), disp = c(300, 400), + hp = c(110, 120) + )) + # fit model with outliers + model <- lm(disp ~ mpg + hp, data = mt2) + set.seed(123) + res <- simulate_residuals(model) + out <- check_outliers(res) + expect_equal( + out, + structure( + list( + Coefficient = 0.0294117647058824, Expected = 0.00796812749003984, + CI_low = 0.000744364234690261, CI_high = 0.153267669560318, + p_value = 0.238146844116552 + ), + class = c("check_outliers_simres", "list") + ), + ignore_attr = TRUE, + tolerance = 1e-4 + ) + expect_identical( + capture.output(print(out)), + c( + "# Outliers detection", "", " Proportion of observed outliers: 2.94%", + " Proportion of expected outliers: 0.80%, 95% CI [0.07, 15.33]", + "" + ) + ) +}) diff --git a/tests/testthat/test-check_overdispersion.R b/tests/testthat/test-check_overdispersion.R index cdd36bcd0..2930322bb 100644 --- a/tests/testthat/test-check_overdispersion.R +++ b/tests/testthat/test-check_overdispersion.R @@ -1,11 +1,12 @@ -test_that("check_overdispersion", { +test_that("check_overdispersion, glmmTMB-poisson", { skip_if_not_installed("glmmTMB") skip_if_not(getRversion() >= "4.0.0") data(Salamanders, package = "glmmTMB") m1 <- glm(count ~ spp + mined, family = poisson, data = Salamanders) + out <- check_overdispersion(m1) expect_equal( - check_overdispersion(m1), + out, structure( list( chisq_statistic = 1873.71012423995, @@ -18,9 +19,36 @@ test_that("check_overdispersion", { ), tolerance = 1e-3 ) + expect_identical( + capture.output(print(out)), + c( + "# Overdispersion test", + "", + " dispersion ratio = 2.946", + " Pearson's Chi-Squared = 1873.710", + " p-value = < 0.001", + "" + ) + ) + expect_message(capture.output(print(out)), "Overdispersion detected") + + set.seed(123) + out <- check_overdispersion(simulate_residuals(m1)) + expect_equal( + out, + structure( + list( + dispersion_ratio = 3.91516791651235, + p_value = 0 + ), + class = c("check_overdisp", "see_check_overdisp") + ), + tolerance = 1e-3 + ) }) -test_that("check_overdispersion", { + +test_that("check_overdispersion, glmmTMB-poisson mixed", { skip_if_not_installed("glmmTMB") skip_if_not(getRversion() >= "4.0.0") data(Salamanders, package = "glmmTMB") @@ -45,3 +73,133 @@ test_that("check_overdispersion", { tolerance = 1e-3 ) }) + + +test_that("check_overdispersion, zero-inflated and negbin", { + skip_if_not_installed("glmmTMB") + skip_if_not_installed("DHARMa") + skip_if_not(getRversion() >= "4.0.0") + data(Salamanders, package = 
"glmmTMB") + + m1 <- glmmTMB::glmmTMB( + count ~ spp + mined, + ziformula = ~ spp + mined, + family = poisson, + data = Salamanders + ) + m2 <- glmmTMB::glmmTMB( + count ~ spp + mined, + family = poisson, + data = Salamanders + ) + m3 <- glmmTMB::glmmTMB( + count ~ spp + mined, + family = glmmTMB::nbinom1(), + data = Salamanders + ) + expect_equal( + check_overdispersion(m1), + structure( + list( + dispersion_ratio = 1.98057695890769, + p_value = 0 + ), + class = c("check_overdisp", "see_check_overdisp") + ), + tolerance = 1e-4, + ignore_attr = TRUE + ) + expect_equal( + check_overdispersion(m2), + structure( + list( + chisq_statistic = 1873.7105986433, + dispersion_ratio = 2.94608584692342, + residual_df = 636L, + p_value = 3.26556213101505e-122 + ), + class = c("check_overdisp", "see_check_overdisp"), + object_name = "m1" + ), + tolerance = 1e-4, + ignore_attr = TRUE + ) + expect_equal( + check_overdispersion(m1), + structure( + list( + dispersion_ratio = 1.98057695890769, + p_value = 0 + ), + class = c("check_overdisp", "see_check_overdisp") + ), + tolerance = 1e-4, + ignore_attr = TRUE + ) +}) + + +test_that("check_overdispersion, MASS::negbin", { + skip_if_not_installed("MASS") + skip_if_not_installed("DHARMa") + set.seed(3) + mu <- rpois(500, lambda = 3) + x <- rnorm(500, mu, mu * 3) + x <- ceiling(x) + x <- pmax(x, 0) + m <- MASS::glm.nb(x ~ mu) + out <- check_overdispersion(m) + expect_equal( + out, + structure( + list( + dispersion_ratio = 0.409521313173506, + p_value = 0 + ), + class = c("check_overdisp", "see_check_overdisp") + ), + ignore_attr = TRUE, + tolerance = 1e-4 + ) + expect_identical( + capture.output(print(out)), + c( + "# Overdispersion test", + "", + " dispersion ratio = 0.410", + " p-value = < 0.001", + "" + ) + ) + expect_message(capture.output(print(out)), "Underdispersion detected") + + # check that plot works + skip_if_not_installed("see") + expect_s3_class(plot(out), "ggplot") +}) + + +test_that("check_overdispersion, genpois", { + skip_if_not_installed("glmmTMB") + skip_if_not_installed("DHARMa") + skip_if_not(getRversion() >= "4.0.0") + data(Salamanders, package = "glmmTMB") + + model <- glmmTMB::glmmTMB( + count ~ mined + spp + (1 | site), + family = glmmTMB::genpois(), + data = Salamanders + ) + expect_equal( + check_overdispersion(model), + structure( + list( + dispersion_ratio = 0.971975646955856, + p_value = 0.88 + ), + class = c("check_overdisp", "see_check_overdisp") + ), + tolerance = 1e-4, + ignore_attr = TRUE + ) +}) diff --git a/tests/testthat/test-check_residuals.R b/tests/testthat/test-check_residuals.R new file mode 100644 index 000000000..33407abf9 --- /dev/null +++ b/tests/testthat/test-check_residuals.R @@ -0,0 +1,26 @@ +test_that("check_singularity, lme4", { + skip_on_cran() + skip_if_not_installed("DHARMa") + set.seed(123) + dat <- DHARMa::createData(sampleSize = 100, overdispersion = 0.5, family = poisson()) + m <- glm(observedResponse ~ Environment1, family = poisson(), data = dat) + res <- simulate_residuals(m) + out <- check_residuals(res) + expect_equal(out, 0.01884602, ignore_attr = TRUE, tolerance = 1e-4) + expect_identical( + capture.output(print(out)), + "Warning: Non-uniformity of simulated residuals detected (p = 0.019)." 
+ ) + expect_error(simulate_residuals(m, iterations = 1), "`iterations` must be") + + skip_if_not_installed("MASS") + set.seed(3) + mu <- rpois(500, lambda = 3) + x <- rnorm(500, mu, mu * 3) + x <- ceiling(x) + x <- pmax(x, 0) + quine.nb1 <- MASS::glm.nb(x ~ mu) + set.seed(123) + result <- check_residuals(quine.nb1) + expect_equal(result, 0.000665414, tolerance = 1e-3, ignore_attr = TRUE) +}) diff --git a/tests/testthat/test-check_zeroinflation.R b/tests/testthat/test-check_zeroinflation.R index d2e60f065..38a5c7726 100644 --- a/tests/testthat/test-check_zeroinflation.R +++ b/tests/testthat/test-check_zeroinflation.R @@ -19,7 +19,58 @@ test_that("check_zeroinflation", { ) }) + +test_that("check_zeroinflation, glmmTMB with and without zero-inflation component", { + skip_if_not_installed("glmmTMB") + skip_if_not_installed("DHARMa") + set.seed(123) + data(Salamanders, package = "glmmTMB") + + # no zero-inflation model + m <- glmmTMB::glmmTMB(count ~ spp + mined, family = poisson, data = Salamanders) + + expect_equal( + check_zeroinflation(m), + structure( + list( + predicted.zeros = 298, + observed.zeros = 387L, + ratio = 0.770025839793282, + tolerance = 0.05 + ), + class = "check_zi" + ), + tolerance = 1e-3 + ) + + # zero-inflation model + m <- glmmTMB::glmmTMB( + count ~ spp + mined, + ziformula = ~ spp + mined, + family = poisson, + data = Salamanders + ) + + set.seed(123) + expect_equal( + check_zeroinflation(m), + structure( + list( + predicted.zeros = 387, + observed.zeros = 387L, + ratio = 1.00093023255814, + tolerance = 0.1, + p.value = 1 + ), + class = "check_zi" + ), + tolerance = 1e-3 + ) +}) + + test_that("check_zeroinflation, glmer.nb", { + skip_on_cran() skip_if_not_installed("glmmTMB") skip_if_not_installed("lme4") set.seed(101) @@ -34,19 +85,107 @@ test_that("check_zeroinflation, glmer.nb", { mu <- 5 * (-4 + with(dd, as.integer(f1) + 4 * as.numeric(f2))) dd$y <- rnbinom(nrow(dd), mu = mu, size = 0.5) dat2 <<- dd - suppressMessages( + suppressMessages({ m <- lme4::glmer.nb(y ~ f1 * f2 + (1 | g), data = dat2, verbose = FALSE) - ) + }) expect_equal( check_zeroinflation(m), structure( list( - predicted.zeros = 153, observed.zeros = 155L, - ratio = 0.987096774193548, tolerance = 0.05 + predicted.zeros = 153, + observed.zeros = 155L, + ratio = 0.987329032258065, + tolerance = 0.1, + p.value = 0.944 + ), + class = "check_zi" + ), + tolerance = 1e-3 + ) +}) + + +test_that("check_zeroinflation, glmmTMB nbinom", { + skip_if_not_installed("glmmTMB") + skip_if_not_installed("DHARMa") + skip_on_cran() + + set.seed(1234) + dat <- DHARMa::createData(sampleSize = 1000) + fit <- suppressWarnings(glmmTMB::glmmTMB( + observedResponse ~ Environment1 + (1 | group), + data = dat, + family = glmmTMB::nbinom1() + )) + expect_equal( + check_zeroinflation(fit), + structure( + list( + predicted.zeros = 462, + observed.zeros = 482L, + ratio = 0.95850622406639, + tolerance = 0.1, + p.value = 0.776 ), class = "check_zi" ), tolerance = 1e-3 ) }) + + +test_that("check_zeroinflation, MASS::negbin", { + skip_if_not_installed("MASS") + skip_if_not_installed("DHARMa") + set.seed(3) + mu <- rpois(500, lambda = 3) + x <- rnorm(500, mu, mu * 3) + x <- ceiling(x) + x <- pmax(x, 0) + m <- MASS::glm.nb(x ~ mu) + expect_equal( + check_zeroinflation(m), + structure( + list( + predicted.zeros = 178, + observed.zeros = 202L, + ratio = 0.879643564356436, + tolerance = 0.1, + p.value = 0.008 + ), + class = "check_zi" + ), + ignore_attr = TRUE, + tolerance = 1e-4 + ) +}) + + +test_that("check_zeroinflation, genpois", { + 
skip_if_not_installed("glmmTMB") + skip_if_not_installed("DHARMa") + skip_if_not(getRversion() >= "4.0.0") + data(Salamanders, package = "glmmTMB") + + model <- glmmTMB::glmmTMB( + count ~ mined + spp + (1 | site), + family = glmmTMB::genpois(), + data = Salamanders + ) + expect_equal( + check_zeroinflation(model), + structure( + list( + predicted.zeros = 386, + observed.zeros = 387L, + ratio = 0.997860465116279, + tolerance = 0.1, + p.value = 1 + ), + class = "check_zi" + ), + tolerance = 1e-4, + ignore_attr = TRUE + ) +}) diff --git a/tests/testthat/test-checks.R b/tests/testthat/test-checks.R index 46fd77dc5..b6d59af3b 100644 --- a/tests/testthat/test-checks.R +++ b/tests/testthat/test-checks.R @@ -1,12 +1,36 @@ test_that("check_factorstructure", { skip_if_not_installed("parameters") x <- check_factorstructure(mtcars) - expect_equal(x$KMO$MSA, 0.826, tolerance = 0.01) - expect_equal(x$sphericity$chisq, 408.011, tolerance = 0.01) + expect_equal(x$KMO$MSA, 0.8265536, tolerance = 0.01) + expect_equal(x$sphericity$chisq, 408.0116, tolerance = 0.01) }) -test_that("check_clusterstructure", { +test_that("check_clusterstructure, ok", { skip_if_not_installed("parameters") set.seed(333) - expect_equal(check_clusterstructure(iris[, 1:4])$H, 0.187, tolerance = 0.01) + out <- check_clusterstructure(iris[, 1:4]) + expect_equal(out$H, 0.1869618, tolerance = 0.01) + expect_identical( + capture.output(print(out)), + c( + "# Clustering tendency", + "", + "The dataset is suitable for clustering (Hopkins' H = 0.19)." + ) + ) +}) + +test_that("check_clusterstructure, bad", { + skip_if_not_installed("parameters") + set.seed(13) + out <- check_clusterstructure(mtcars[, 10:11]) + expect_equal(out$H, 0.5142575, tolerance = 0.01) + expect_identical( + capture.output(print(out)), + c( + "# Clustering tendency", + "", + "The dataset is not suitable for clustering (Hopkins' H = 0.51)." + ) + ) }) diff --git a/vignettes/simulate_residuals.Rmd b/vignettes/simulate_residuals.Rmd new file mode 100644 index 000000000..7338aea4e --- /dev/null +++ b/vignettes/simulate_residuals.Rmd @@ -0,0 +1,94 @@ +--- +title: "Checking simulated residuals" +output: + rmarkdown::html_vignette: + toc: true + fig_width: 10.08 + fig_height: 6 +tags: [r, performance] +vignette: > + \usepackage[utf8]{inputenc} + %\VignetteIndexEntry{Checking simulated residuals} + %\VignetteEngine{knitr::rmarkdown} +editor_options: + chunk_output_type: console +--- + +```{r , include=FALSE} +library(knitr) +library(performance) +options(knitr.kable.NA = "") +knitr::opts_chunk$set( + comment = ">", + message = FALSE, + warning = FALSE, + out.width = "100%", + dpi = 450 +) +options(digits = 2) + +pkgs <- c("DHARMa", "glmmTMB") +successfully_loaded <- vapply(pkgs, requireNamespace, FUN.VALUE = logical(1L), quietly = TRUE) +can_evaluate <- all(successfully_loaded) + +if (can_evaluate) { + knitr::opts_chunk$set(eval = TRUE) + vapply(pkgs, require, FUN.VALUE = logical(1L), quietly = TRUE, character.only = TRUE) +} else { + knitr::opts_chunk$set(eval = FALSE) +} +``` + +The basic workflow for simulated residual checks using `simulate_residuals()` is as follows. 
+First, fit a model:
+
+```{r}
+library(glmmTMB)
+
+model <- glmmTMB(
+  count ~ mined + spp + (1 | site),
+  family = poisson,
+  data = Salamanders
+)
+```
+
+Next, simulate residuals from the model:
+
+```{r}
+simulated_residuals <- simulate_residuals(model)
+
+simulated_residuals
+```
+
+Because the returned object inherits from the DHARMa class, all methods implemented in DHARMa just work, including its tests:
+
+```{r}
+library(DHARMa)
+residuals(simulated_residuals)
+
+DHARMa::testUniformity(simulated_residuals, plot = FALSE)
+```
+
+Finally, run specific checks on the simulated residuals:
+
+```{r}
+check_residuals(simulated_residuals)
+```
+
+Or check the entire model:
+
+```{r, eval=FALSE}
+# TODO (not implemented)
+check_model(simulated_residuals)
+```
+
+The `check_model()` function is the main reason we don't prematurely extract the residuals in `simulate_residuals()`: if we did, the `simulated_residuals` object would no longer contain the model fit (`fittedModel` in the output below), and we could not run the checks that need the model itself (e.g., posterior predictive checks).
+
+```{r}
+str(simulated_residuals, max.level = 1)
+```
+
+It would also mean we would need to reimplement some of the tests from DHARMa (e.g., `DHARMa::testOutliers()`) if we plan to include those checks as well. We probably don't want to do that, since some of them are fairly involved rather than just being wrappers around tests supplied in base R.
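+
+Note that `check_overdispersion()`, `check_zeroinflation()` and `check_outliers()` also provide methods for simulated residuals, so the same object can be reused for those checks as well. A minimal sketch (the exact values depend on the simulation seed):
+
+```{r}
+# reuse the simulated residuals for further checks
+check_overdispersion(simulated_residuals)
+
+check_zeroinflation(simulated_residuals)
+
+check_outliers(simulated_residuals)
+```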