diff --git a/NAMESPACE b/NAMESPACE index 236899110..39d449fdb 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -23,6 +23,8 @@ S3method(data_plot,performance_pp_check) S3method(data_plot,point_estimate) S3method(data_plot,rope) S3method(data_plot,see_compare_parameters) +S3method(plot,dw_data_tabulate) +S3method(plot,dw_data_tabulates) S3method(plot,see_bayesfactor_models) S3method(plot,see_bayesfactor_parameters) S3method(plot,see_bayesfactor_savagedickey) diff --git a/NEWS.md b/NEWS.md index 15550cab9..124760e61 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,10 @@ # see (development version) +## New features + +* There is now a `plot()` method for outputs of `datawizard::data_tabulate()` + (#293). + ## Minor Changes * The `print()` method for `performance::check_model()` now also evaluates the diff --git a/R/plot.dw_data_tabulate.R b/R/plot.dw_data_tabulate.R new file mode 100644 index 000000000..9533c16e7 --- /dev/null +++ b/R/plot.dw_data_tabulate.R @@ -0,0 +1,137 @@ +#' Plot tabulated data. +#' +#' @param x Object created by `datawizard::data_tabulate()`. +#' @param label_values Logical. Should values and percentages be displayed at the +#' top of each bar? +#' @param show_na Should missing values be shown? Can be `"if_any"` (default) to show +#' the missing category only if any missing values are present, `"always"` to +#' always show the missing category, or `"never"` to never show the missing +#' category. +#' @param na_label The label given to missing values when they are shown. +#' @param error_bar Logical. Should error bars be displayed? +#' If `TRUE`, confidence intervals computed using the Wilson method are shown. +#' See Brown et al. (2001) for details. +#' @param ci Confidence Interval (CI) level. Defaults to `0.95` (95%). +#' @param fill_col Color to use for category columns (default: "#87CEFA"). +#' @param color_error_bar Color to use for error bars (default: "#607B8B"). +#' @param ... Unused. +#' +#' @references +#' Brown, L. D., Cai, T. 
#' T., & DasGupta, A. (2001).
#' Interval estimation for a binomial proportion.
#' _Statistical Science, 16_(2), 101–133. \doi{10.1214/ss/1009213286}
#'
#' @rdname plot.dw_data_tabulate
#' @export
plot.dw_data_tabulates <- function(x, label_values = TRUE,
                                   show_na = c("if_any", "always", "never"),
                                   na_label = "(Missing)",
                                   error_bar = TRUE, ci = 0.95,
                                   fill_col = "#87CEFA",
                                   color_error_bar = "#607B8B",
                                   ...) {
  show_na <- match.arg(show_na, choices = c("if_any", "always", "never"))

  # `x` is a list of frequency tables: draw every table with the single-table
  # method, passing the same settings to each call.
  plots <- lapply(x, function(tab) {
    plot.dw_data_tabulate(
      tab,
      label_values = label_values,
      show_na = show_na, na_label = na_label,
      error_bar = error_bar, ci = ci,
      fill_col = fill_col, color_error_bar = color_error_bar
    )
  })

  # A single table yields the plot itself rather than a list of length one.
  if (length(plots) == 1L) {
    return(plots[[1L]])
  }
  plots
}

#' @rdname plot.dw_data_tabulate
#'
#' @export
plot.dw_data_tabulate <- function(x, label_values = TRUE,
                                  show_na = c("if_any", "always", "never"),
                                  na_label = "(Missing)",
                                  error_bar = TRUE, ci = 0.95,
                                  fill_col = "#87CEFA",
                                  color_error_bar = "#607B8B",
                                  ...) {
  show_na <- match.arg(show_na, choices = c("if_any", "always", "never"))
  dat <- as.data.frame(x)
  is_missing <- is.na(dat$Value)

  # Resolve "if_any" to a definite choice: the missing category is shown only
  # when an NA row exists and its count is positive. A plain `if` with `any()`
  # keeps the result a single string even if several rows match.
  if (show_na == "if_any") {
    has_missing <- any(dat$N[is_missing] > 0, na.rm = TRUE)
    show_na <- if (has_missing) "always" else "never"
  }

  if (show_na == "never") {
    # Drop the NA row and plot the percentages from the "Valid" column.
    dat <- dat[!is_missing, ]
    dat$output <- dat[[which(startsWith(names(dat), "Valid"))]]
  } else {
    # Keep the NA row and plot the percentages from the "Raw" column.
    dat$output <- dat[[which(startsWith(names(dat), "Raw"))]]

    # Relabel the NA category and force it to be the last factor level so it
    # is drawn as the right-most bar.
    value_chr <- as.character(dat$Value)
    value_chr[is.na(value_chr)] <- na_label
    dat$Value <- factor(
      value_chr,
      levels = c(setdiff(value_chr, na_label), na_label)
    )
  }

  draw_ci <- isTRUE(error_bar)
  if (draw_ci) {
    total_n <- sum(dat$N)
    bounds <- .wilson_ci(prop = dat$output / 100, total_n = total_n, ci = ci)
    # The Wilson bounds are proportions; rescale them to counts so they share
    # the y-axis with the bars.
    dat <- cbind(dat, CI = ci, bounds * total_n)
  }

  # With error bars the label is a single line (it is shifted sideways below);
  # without them the percentage goes on a second line.
  label_sep <- if (draw_ci) " (" else "\n("
  dat$label <- paste0(dat$N, label_sep, round(dat$output, 2), "%)")

  out <- ggplot2::ggplot(dat) +
    ggplot2::aes(x = .data$Value, y = .data$N) +
    ggplot2::geom_col(fill = fill_col) +
    ggplot2::labs(title = unique(dat$Variable)) +
    theme_modern()

  if (isTRUE(label_values)) {
    if (draw_ci) {
      out <- out +
        ggplot2::geom_text(
          ggplot2::aes(label = .data$label),
          vjust = -1, hjust = 1.2
        ) +
        ggplot2::coord_cartesian(ylim = c(0, max(dat$CI_high)))
    } else {
      out <- out +
        ggplot2::geom_text(ggplot2::aes(label = .data$label), vjust = -0.5) +
        ggplot2::coord_cartesian(ylim = c(0, max(dat$N) * 1.2))
    }
  }

  # add confidence intervals for frequencies
  if (draw_ci) {
    out <- out +
      ggplot2::geom_linerange(
        ggplot2::aes(ymin = .data$CI_low, ymax = .data$CI_high),
        color = color_error_bar
      )
  }

  out
}

# Wilson score interval for binomial proportions.
#
# `prop` is a vector of proportions, `total_n` the number of observations the
# proportions are based on, and `ci` the confidence level. Returns a data frame
# with the columns `CI_low` and `CI_high`, on the proportion scale.
# Stops with an error when `ci` is not a single number strictly between 0
# and 1.
.wilson_ci <- function(prop, total_n, ci = 0.95) {
  if (!is.numeric(ci) || length(ci) != 1L || is.na(ci) || ci <= 0 || ci >= 1) {
    stop("`ci` must be a single number between 0 and 1.", call. = FALSE)
  }

  z <- stats::qnorm((1 - ci) / 2, lower.tail = FALSE)
  z2_n <- z^2 / total_n

  centre <- prop + 0.5 * z2_n
  margin <- z * sqrt((prop * (1 - prop) + 0.25 * z2_n) / total_n)
  denom <- 1 + z2_n

  data.frame(
    CI_low = (centre - margin) / denom,
    CI_high = (centre + margin) / denom
  )
}
# A single selected variable must give one ggplot object, not a list of plots.
test_that("`plot.dw_data_tabulate()` works with single table", {
  x <- datawizard::data_tabulate(mtcars, select = "cyl")
  expect_s3_class(plot(x), "gg")
})

# Several selected variables must give a plain list with one ggplot per table.
# Checking only `is.list()` is not enough, because a single ggplot object can
# itself be a list.
test_that("`plot.dw_data_tabulate()` works with multiple tables", {
  x <- datawizard::data_tabulate(mtcars, select = c("cyl", "carb", "am"))
  out <- plot(x)
  expect_type(out, "list")
  expect_length(out, 3L)
  for (p in out) {
    expect_s3_class(p, "gg")
  }
})