Skip to content

Commit

Permalink
Improve binned_residuals() (#641)
Browse files Browse the repository at this point in the history
  • Loading branch information
strengejacke authored Oct 26, 2023
1 parent 3408a7c commit eb4fafa
Show file tree
Hide file tree
Showing 8 changed files with 285 additions and 95 deletions.
10 changes: 10 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,15 @@
# performance 0.10.7

## Breaking changes

* `binned_residuals()` gains a few new arguments to control the residuals used
for the test, as well as different options to calculate confidence intervals
(namely, `ci_type`, `residuals`, `ci` and `iterations`). The default values
to compute binned residuals have changed. Default residuals are now "deviance"
residuals (and no longer "response" residuals). Default confidence intervals
are now "exact" intervals (and no longer based on Gaussian approximation).
Use `ci_type = "gaussian"` and `residuals = "response"` to get the old defaults.

## Changes to functions

* `binned_residuals()` - like `check_model()` - gains a `show_dots` argument to
Expand Down
83 changes: 72 additions & 11 deletions R/binned_residuals.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,19 @@
#' @param n_bins Numeric, the number of bins to divide the data. If
#' `n_bins = NULL`, the square root of the number of observations is
#' taken.
#' @param ci Numeric, the confidence level for the error bounds.
#' @param ci_type Character, the type of error bounds to calculate. Can be
#' `"exact"` (default), `"gaussian"` or `"boot"`. `"exact"` calculates the
#' error bounds based on the exact binomial distribution, using [`binom.test()`].
#' `"gaussian"` uses the Gaussian approximation, while `"boot"` uses a simple
#' bootstrap method, where confidence intervals are calculated based on the
#' quantiles of the bootstrap distribution.
#' @param residuals Character, the type of residuals to calculate. Can be
#' `"deviance"` (default), `"pearson"` or `"response"`. It is recommended to
#' use `"response"` only for those models where other residuals are not
#' available.
#' @param iterations Integer, the number of iterations to use for the
#' bootstrap method. Only used if `ci_type = "boot"`.
#' @param show_dots Logical, if `TRUE`, will show data points in the plot. Set
#' to `FALSE` for models with many observations, if generating the plot is too
#' time-consuming. By default, `show_dots = NULL`. In this case `binned_residuals()`
Expand Down Expand Up @@ -62,12 +75,24 @@
#' }
#'
#' @export
binned_residuals <- function(model, term = NULL, n_bins = NULL, show_dots = NULL, ...) {
fv <- stats::fitted(model)
binned_residuals <- function(model,
term = NULL,
n_bins = NULL,
show_dots = NULL,
ci = 0.95,
ci_type = c("exact", "gaussian", "boot"),
residuals = c("deviance", "pearson", "response"),
iterations = 1000,
...) {
# match arguments
ci_type <- match.arg(ci_type)
residuals <- match.arg(residuals)

fitted_values <- stats::fitted(model)
mf <- insight::get_data(model, verbose = FALSE)

if (is.null(term)) {
pred <- fv
pred <- fitted_values
} else {
pred <- mf[[term]]
}
Expand All @@ -78,7 +103,20 @@ binned_residuals <- function(model, term = NULL, n_bins = NULL, show_dots = NULL
show_dots <- is.null(n) || n <= 1e5
}

y <- .recode_to_zero(insight::get_response(model, verbose = FALSE)) - fv
# make sure response is 0/1 (and numeric)
y0 <- .recode_to_zero(insight::get_response(model, verbose = FALSE))

# calculate residuals
y <- switch(residuals,
response = y0 - fitted_values,
pearson = .safe((y0 - fitted_values) / sqrt(fitted_values * (1 - fitted_values))),
deviance = .safe(stats::residuals(model, type = "deviance"))
)

# make sure we really have residuals
if (is.null(y)) {
insight::format_error("Could not calculate residuals. Try using `residuals = \"response\"`.")
}

if (is.null(n_bins)) n_bins <- round(sqrt(length(pred)))

Expand All @@ -95,24 +133,32 @@ binned_residuals <- function(model, term = NULL, n_bins = NULL, show_dots = NULL
n <- length(items)
sdev <- stats::sd(y[items], na.rm = TRUE)

data.frame(
conf_int <- switch(ci_type,
gaussian = stats::qnorm(c((1 - ci) / 2, (1 + ci) / 2), mean = ybar, sd = sdev / sqrt(n)),
exact = {
out <- stats::binom.test(sum(y0[items]), n)$conf.int
# center CIs around point estimate
out <- out - (min(out) - ybar) - (diff(out) / 2)
out
},
boot = .boot_binned_ci(y[items], ci, iterations)
)
names(conf_int) <- c("CI_low", "CI_high")

d0 <- data.frame(
xbar = xbar,
ybar = ybar,
n = n,
x.lo = model.range[1],
x.hi = model.range[2],
se = stats::qnorm(0.975) * sdev / sqrt(n),
ci_range = sdev / sqrt(n)
se = stats::qnorm((1 + ci) / 2) * sdev / sqrt(n)
)
cbind(d0, rbind(conf_int))
}))

d <- do.call(rbind, d)
d <- d[stats::complete.cases(d), ]

# CIs
d$CI_low <- d$ybar - stats::qnorm(0.975) * d$ci_range
d$CI_high <- d$ybar + stats::qnorm(0.975) * d$ci_range

gr <- abs(d$ybar) > abs(d$se)
d$group <- "yes"
d$group[gr] <- "no"
Expand All @@ -129,6 +175,21 @@ binned_residuals <- function(model, term = NULL, n_bins = NULL, show_dots = NULL
}


# utilities ---------------------------

.boot_binned_ci <- function(x, ci = 0.95, iterations = 1000) {
x <- x[!is.na(x)]
n <- length(x)
out <- vector("numeric", iterations)
for (i in seq_len(iterations)) {
out[i] <- sum(x[sample.int(n, n, replace = TRUE)])
}
out <- out / n

quant <- stats::quantile(out, c((1 - ci) / 2, (1 + ci) / 2))
c(CI_low = quant[1L], CI_high = quant[2L])
}


# methods -----------------------------

Expand Down
35 changes: 18 additions & 17 deletions R/check_model.R
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,8 @@
#' tries to guess whether performance will be poor due to a very large model
#' and thus automatically shows or hides dots.
#' @param verbose If `FALSE` (default), suppress most warning messages.
#' @param ... Currently not used.
#' @param ... Arguments passed down to the individual check functions, especially
#' to `check_predictions()` and `binned_residuals()`.
#' @inheritParams check_predictions
#'
#' @return The data frame that is used for plotting.
Expand Down Expand Up @@ -185,11 +186,11 @@ check_model.default <- function(x,
ca <- tryCatch(
{
if (minfo$is_bayesian) {
suppressWarnings(.check_assumptions_stan(x))
suppressWarnings(.check_assumptions_stan(x, ...))
} else if (minfo$is_linear) {
suppressWarnings(.check_assumptions_linear(x, minfo, verbose))
suppressWarnings(.check_assumptions_linear(x, minfo, verbose, ...))
} else {
suppressWarnings(.check_assumptions_glm(x, minfo, verbose))
suppressWarnings(.check_assumptions_glm(x, minfo, verbose, ...))
}
},
error = function(e) {
Expand All @@ -202,7 +203,7 @@ check_model.default <- function(x,
}

# try to find sensible default for "type" argument
suggest_dots <- (minfo$is_bernoulli || minfo$is_count || minfo$is_ordinal || minfo$is_categorical || minfo$is_multinomial)
suggest_dots <- (minfo$is_bernoulli || minfo$is_count || minfo$is_ordinal || minfo$is_categorical || minfo$is_multinomial) # nolint
if (missing(type) && suggest_dots) {
type <- "discrete_interval"
}
Expand Down Expand Up @@ -330,7 +331,7 @@ check_model.model_fit <- function(x,

# compile plots for checks of linear models ------------------------

.check_assumptions_linear <- function(model, model_info, verbose = TRUE) {
.check_assumptions_linear <- function(model, model_info, verbose = TRUE, ...) {
dat <- list()

dat$VIF <- .diag_vif(model, verbose = verbose)
Expand All @@ -340,13 +341,13 @@ check_model.model_fit <- function(x,
dat$NCV <- .diag_ncv(model, verbose = verbose)
dat$HOMOGENEITY <- .diag_homogeneity(model, verbose = verbose)
dat$OUTLIERS <- check_outliers(model, method = "cook")
if (!is.null(dat$OUTLIERS)) {
threshold <- attributes(dat$OUTLIERS)$threshold$cook
} else {
if (is.null(dat$OUTLIERS)) {
threshold <- NULL
} else {
threshold <- attributes(dat$OUTLIERS)$threshold$cook
}
dat$INFLUENTIAL <- .influential_obs(model, threshold = threshold)
dat$PP_CHECK <- .safe(check_predictions(model))
dat$PP_CHECK <- .safe(check_predictions(model, ...))

dat <- insight::compact_list(dat)
class(dat) <- c("check_model", "see_check_model")
Expand All @@ -357,23 +358,23 @@ check_model.model_fit <- function(x,

# compile plots for checks of generalized linear models ------------------------

.check_assumptions_glm <- function(model, model_info, verbose = TRUE) {
.check_assumptions_glm <- function(model, model_info, verbose = TRUE, ...) {
dat <- list()

dat$VIF <- .diag_vif(model, verbose = verbose)
dat$QQ <- .diag_qq(model, verbose = verbose)
dat$HOMOGENEITY <- .diag_homogeneity(model, verbose = verbose)
dat$REQQ <- .diag_reqq(model, level = 0.95, model_info = model_info, verbose = verbose)
dat$OUTLIERS <- check_outliers(model, method = "cook")
if (!is.null(dat$OUTLIERS)) {
threshold <- attributes(dat$OUTLIERS)$threshold$cook
} else {
if (is.null(dat$OUTLIERS)) {
threshold <- NULL
} else {
threshold <- attributes(dat$OUTLIERS)$threshold$cook
}
dat$INFLUENTIAL <- .influential_obs(model, threshold = threshold)
dat$PP_CHECK <- .safe(check_predictions(model))
dat$PP_CHECK <- .safe(check_predictions(model, ...))
if (isTRUE(model_info$is_binomial)) {
dat$BINNED_RESID <- binned_residuals(model)
dat$BINNED_RESID <- binned_residuals(model, ...)
}
if (isTRUE(model_info$is_count)) {
dat$OVERDISPERSION <- .diag_overdispersion(model)
Expand All @@ -388,7 +389,7 @@ check_model.model_fit <- function(x,

# compile plots for checks of Bayesian models ------------------------

.check_assumptions_stan <- function(model) {
.check_assumptions_stan <- function(model, ...) {
if (inherits(model, "brmsfit")) {
# check if brms can be loaded

Expand Down
Loading

0 comments on commit eb4fafa

Please sign in to comment.