From 3bc7a8bf6a7c4f29d8ea7515e4e97b788702a907 Mon Sep 17 00:00:00 2001 From: Malcolm Barrett Date: Wed, 28 Aug 2024 11:43:25 -0400 Subject: [PATCH] run styler --- R/ggdag-mask.R | 32 +-- chapters/06-not-just-a-stats-problem.qmd | 223 +++++++++--------- chapters/07-prep-data.qmd | 2 +- chapters/15-g-comp.qmd | 132 ++++++----- chapters/17-missingness-and-measurement.qmd | 238 ++++++++++---------- 5 files changed, 317 insertions(+), 310 deletions(-) diff --git a/R/ggdag-mask.R b/R/ggdag-mask.R index 6a4a981..0788923 100644 --- a/R/ggdag-mask.R +++ b/R/ggdag-mask.R @@ -2,21 +2,25 @@ # TODO: when `geom_dag_label_repel2` exists, add to namespace as 1 then delete this first bit # copied from source to avoid recursion issue in overriding in ggdag namsespace ggdag_geom_dag_label_repel <- function( - mapping = NULL, data = NULL, parse = FALSE, ..., - box.padding = grid::unit(0.35,"lines"), label.padding = grid::unit(0.25, "lines"), - point.padding = grid::unit(1.5, "lines"), label.r = grid::unit(0.15, "lines"), - label.size = 0.25, segment.color = "grey50", segment.size = 0.5, arrow = NULL, - force = 1, max.iter = 2000, nudge_x = 0, nudge_y = 0, na.rm = FALSE, + mapping = NULL, data = NULL, parse = FALSE, ..., + box.padding = grid::unit(0.35, "lines"), label.padding = grid::unit(0.25, "lines"), + point.padding = grid::unit(1.5, "lines"), label.r = grid::unit(0.15, "lines"), + label.size = 0.25, segment.color = "grey50", segment.size = 0.5, arrow = NULL, + force = 1, max.iter = 2000, nudge_x = 0, nudge_y = 0, na.rm = FALSE, show.legend = NA, inherit.aes = TRUE) { - ggplot2::layer(data = data, mapping = mapping, stat = ggdag:::StatNodesRepel, - geom = ggrepel::GeomLabelRepel, position = "identity", - show.legend = show.legend, inherit.aes = inherit.aes, - params = list(parse = parse, box.padding = box.padding, - label.padding = label.padding, point.padding = point.padding, - label.r = label.r, label.size = label.size, segment.colour = segment.color %||% - segment.colour, segment.size = segment.size, - arrow = arrow, na.rm = na.rm, force = force, max.iter = max.iter, - nudge_x = nudge_x, nudge_y = nudge_y, segment.alpha = 1, ...)) + ggplot2::layer( + data = data, mapping = mapping, stat = ggdag:::StatNodesRepel, + geom = ggrepel::GeomLabelRepel, position = "identity", + show.legend = show.legend, inherit.aes = inherit.aes, + params = list( + parse = parse, box.padding = box.padding, + label.padding = label.padding, point.padding = point.padding, + label.r = label.r, label.size = label.size, segment.colour = segment.color %||% + segment.colour, segment.size = segment.size, + arrow = arrow, na.rm = na.rm, force = force, max.iter = max.iter, + nudge_x = nudge_x, nudge_y = nudge_y, segment.alpha = 1, ... + ) + ) } geom_dag_label_repel_internal <- function(..., seed = 10) { diff --git a/chapters/06-not-just-a-stats-problem.qmd b/chapters/06-not-just-a-stats-problem.qmd index 7aa617f..8f2069f 100644 --- a/chapters/06-not-just-a-stats-problem.qmd +++ b/chapters/06-not-just-a-stats-problem.qmd @@ -19,11 +19,11 @@ In the plots in @fig-anscombe, each data set has remarkably similar summary stat #| fig-cap: "Anscombe's Quartet, a set of four datasets with nearly identical summary statistics. Anscombe's point was that one must visualize the data to understand it." library(quartets) -anscombe_quartet |> - ggplot(aes(x, y)) + - geom_point() + - geom_smooth(method = "lm", se = FALSE) + - facet_wrap(~ dataset) +anscombe_quartet |> + ggplot(aes(x, y)) + + geom_point() + + geom_smooth(method = "lm", se = FALSE) + + facet_wrap(~dataset) ``` The Datasaurus Dozen is a modern take on Anscombe's Quartet. @@ -33,8 +33,8 @@ The mean, standard deviation, and correlation are nearly identical in each datas library(datasauRus) # roughly the same correlation in each dataset -datasaurus_dozen |> - group_by(dataset) |> +datasaurus_dozen |> + group_by(dataset) |> summarize(cor = round(cor(x, y), 2)) ``` @@ -43,10 +43,10 @@ datasaurus_dozen |> #| message: false #| fig-cap: "The Datasaurus Dozen, a set of datasets with nearly identical summary statistics. The Datasaurus Dozen is a modern version of Anscombe's Quartet. It's actually a baker's dozen, but who's counting?" #| fig-height: 8 -datasaurus_dozen |> - ggplot(aes(x, y)) + - geom_point() + - facet_wrap(~ dataset) +datasaurus_dozen |> + ggplot(aes(x, y)) + + geom_point() + + facet_wrap(~dataset) ``` In causal inference, however, even visualization is insufficient to untangle causal effects. @@ -61,16 +61,16 @@ The difference is the causal structure that generated each dataset. #| label: fig-causal_quartet_hidden #| message: false #| fig-cap: "The Causal Quartet, four data sets with nearly identical summary statistics and visualizations. The causal structure of each dataset is different, and data alone cannot tell us which is which." -causal_quartet |> +causal_quartet |> # hide the dataset names - mutate(dataset = as.integer(factor(dataset))) |> + mutate(dataset = as.integer(factor(dataset))) |> group_by(dataset) |> - mutate(exposure = scale(exposure), outcome = scale(outcome)) |> - ungroup() |> - ggplot(aes(exposure, outcome)) + - geom_point() + - geom_smooth(method = "lm", se = FALSE) + - facet_wrap(~ dataset) + mutate(exposure = scale(exposure), outcome = scale(outcome)) |> + ungroup() |> + ggplot(aes(exposure, outcome)) + + geom_point() + + geom_smooth(method = "lm", se = FALSE) + + facet_wrap(~dataset) ``` The question for each dataset is whether to adjust for a third variable, `covariate`. @@ -93,15 +93,15 @@ effects <- causal_quartet |> ate_xz = coef(lm(outcome ~ exposure + covariate, data = data))[2], cor = cor(data$exposure, data$covariate) ) |> - select(-data, dataset) |> + select(-data, dataset) |> ungroup() -gt(effects) |> - fmt_number(columns = -dataset) |> +gt(effects) |> + fmt_number(columns = -dataset) |> cols_label( - dataset = "Dataset", - ate_x = md("Not adjusting for `covariate`"), - ate_xz = md("Adjusting for `covariate`"), + dataset = "Dataset", + ate_x = md("Not adjusting for `covariate`"), + ate_xz = md("Adjusting for `covariate`"), cor = md("Correlation of `exposure` and `covariate`") ) ``` @@ -120,10 +120,10 @@ Even the reverse technique, *excluding* a variable when it's *less* than ten per #| label: tbl-quartet_ten_percent #| echo: false #| tbl-cap: "The percent change in the coefficient for `exposure` when including `covariate` in the model." -effects |> - mutate(percent_change = scales::percent((ate_x - ate_xz) / ate_x)) |> +effects |> + mutate(percent_change = scales::percent((ate_x - ate_xz) / ate_x)) |> select(dataset, percent_change) |> - gt() |> + gt() |> cols_label( dataset = "Dataset", percent_change = "Percent change" @@ -135,10 +135,10 @@ While the visual relationship between `covariate` and `exposure` is not identica In @fig-causal_quartet_covariate, the standardized relationship between the two is identical. ```{r} -causal_quartet |> +causal_quartet |> # hide the dataset names - mutate(dataset = as.integer(factor(dataset))) |> - group_by(dataset) |> + mutate(dataset = as.integer(factor(dataset))) |> + group_by(dataset) |> summarize(cor = round(cor(covariate, exposure), 2)) ``` @@ -146,16 +146,16 @@ causal_quartet |> #| label: fig-causal_quartet_covariate #| message: false #| fig-cap: "The correlation is the same in each dataset, but the visual relationship is not. However, the differences in the plots are not enough information to determine whether `covariate` is a confounder, mediator, or collider." -causal_quartet |> +causal_quartet |> # hide the dataset names - mutate(dataset = as.integer(factor(dataset))) |> + mutate(dataset = as.integer(factor(dataset))) |> group_by(dataset) |> - mutate(covariate = scale(covariate), exposure = scale(exposure)) |> - ungroup() |> - ggplot(aes(covariate, exposure)) + - geom_point() + - geom_smooth(method = "lm", se = FALSE) + - facet_wrap(~ dataset) + mutate(covariate = scale(covariate), exposure = scale(exposure)) |> + ungroup() |> + ggplot(aes(covariate, exposure)) + + geom_point() + + geom_smooth(method = "lm", se = FALSE) + + facet_wrap(~dataset) ``` ::: {.callout-tip} @@ -169,13 +169,13 @@ Standardizing numeric variables to have a mean of 0 and standard deviation of 1, #| label: fig-causal_quartet_covariate_unscaled #| message: false #| fig-cap: "@fig-causal_quartet_covariate, unscaled" -causal_quartet |> +causal_quartet |> # hide the dataset names - mutate(dataset = as.integer(factor(dataset))) |> - ggplot(aes(covariate, exposure)) + - geom_point() + - geom_smooth(method = "lm", se = FALSE) + - facet_wrap(~ dataset) + mutate(dataset = as.integer(factor(dataset))) |> + ggplot(aes(covariate, exposure)) + + geom_point() + + geom_smooth(method = "lm", se = FALSE) + + facet_wrap(~dataset) ``` ::: @@ -189,11 +189,11 @@ In 3, it's a mediator (it depends on the research question). #| label: fig-causal_quartet #| message: false #| fig-cap: "The Causal Quartet, revealed. The first and last datasets are types of collider bias; we should *not* control for `covariate.` In the second dataset, `covariate` is a confounder, and we *should* control for it. In the third dataset, `covariate` is a mediator, and we should control for it if we want the direct effect, but not if we want the total effect." -causal_quartet |> - ggplot(aes(exposure, outcome)) + - geom_point() + - geom_smooth(method = "lm", se = FALSE) + - facet_wrap(~ dataset) +causal_quartet |> + ggplot(aes(exposure, outcome)) + + geom_point() + + geom_smooth(method = "lm", se = FALSE) + + facet_wrap(~dataset) ``` What can we do if the data can't distinguish these causal structures? @@ -210,7 +210,7 @@ Once we compile a DAG for each dataset, we only need to query the DAG for the co #| warning: false #| layout-ncol: 2 #| fig-cap: "The DAGs for the Causal Quartet." -#| fig-subcap: +#| fig-subcap: #| - "The DAG for dataset 1, where `covariate` (c) is a collider. We should *not* adjust for `covariate`, which is a descendant of `exposure` (e) and `outcome` (o)." #| - "The DAG for dataset 2, where `covariate` (c) is a confounder. `covariate` is a mutual cause of `exposure` (e) and `outcome` (o), representing a backdoor path, so we *must* adjust for it to get the right answer." #| - "The DAG for dataset 3, where `covariate` (c) is a mediator. `covariate` is a descendant of `exposure` (e) and a cause of `outcome` (o). The path through `covariate` is the indirect path, and the path through `exposure` is the direct path. We should adjust for `covariate` if we want the direct effect, but not if we want the total effect." @@ -275,7 +275,7 @@ d_mbias <- dagify( p_coll <- d_coll |> tidy_dagitty() |> - mutate(covariate = ifelse(label == "c", "covariate", NA_character_)) |> + mutate(covariate = ifelse(label == "c", "covariate", NA_character_)) |> ggplot( aes(x = x, y = y, xend = xend, yend = yend) ) + @@ -285,10 +285,10 @@ p_coll <- d_coll |> theme_dag() + coord_cartesian(clip = "off") + theme(legend.position = "bottom") + - ggtitle("(1) Collider") + + ggtitle("(1) Collider") + guides(color = guide_legend( - title = NULL, - keywidth = unit(1.4, "mm"), + title = NULL, + keywidth = unit(1.4, "mm"), override.aes = list(size = 3.4, shape = 15) )) + scale_color_discrete(breaks = "covariate", na.value = "grey70") @@ -296,7 +296,7 @@ p_coll <- d_coll |> p_conf <- d_conf |> tidy_dagitty() |> - mutate(covariate = ifelse(label == "c", "covariate", NA_character_)) |> + mutate(covariate = ifelse(label == "c", "covariate", NA_character_)) |> ggplot( aes(x = x, y = y, xend = xend, yend = yend) ) + @@ -306,17 +306,17 @@ p_conf <- d_conf |> theme_dag() + coord_cartesian(clip = "off") + theme(legend.position = "bottom") + - ggtitle("(2) Confounder") + + ggtitle("(2) Confounder") + guides(color = guide_legend( - title = NULL, - keywidth = unit(1.4, "mm"), + title = NULL, + keywidth = unit(1.4, "mm"), override.aes = list(size = 3.4, shape = 15) )) + scale_color_discrete(breaks = "covariate", na.value = "grey70") p_med <- d_med |> tidy_dagitty() |> - mutate(covariate = ifelse(label == "c", "covariate", NA_character_)) |> + mutate(covariate = ifelse(label == "c", "covariate", NA_character_)) |> ggplot( aes(x = x, y = y, xend = xend, yend = yend) ) + @@ -326,10 +326,10 @@ p_med <- d_med |> theme_dag() + coord_cartesian(clip = "off") + theme(legend.position = "bottom") + - ggtitle("(3) Mediator") + + ggtitle("(3) Mediator") + guides(color = guide_legend( - title = NULL, - keywidth = unit(1.4, "mm"), + title = NULL, + keywidth = unit(1.4, "mm"), override.aes = list(size = 3.4, shape = 15) )) + scale_color_discrete(breaks = "covariate", na.value = "grey70") @@ -337,7 +337,7 @@ p_med <- d_med |> p_m_bias <- d_mbias |> tidy_dagitty() |> - mutate(covariate = ifelse(label == "c", "covariate", NA_character_)) |> + mutate(covariate = ifelse(label == "c", "covariate", NA_character_)) |> ggplot( aes(x = x, y = y, xend = xend, yend = yend) ) + @@ -351,10 +351,10 @@ p_m_bias <- d_mbias |> theme_dag() + coord_cartesian(clip = "off") + ggtitle("(4) M-bias") + - theme(legend.position = "bottom") + + theme(legend.position = "bottom") + guides(color = guide_legend( - title = NULL, - keywidth = unit(1.4, "mm"), + title = NULL, + keywidth = unit(1.4, "mm"), override.aes = list(size = 3.4, shape = 15) )) + scale_color_discrete(breaks = "covariate", na.value = "grey70") @@ -374,14 +374,14 @@ For dataset 3, it depends on which mediation effect we want: adjusted for the di ```{r} #| label: tbl-quartets_true_effects #| echo: false -#| tbl-cap: "The data generating mechanism and true causal effects in each dataset. Sometimes, the unadjusted effect is the same, and sometimes it is not, depending on the mechanism and question." +#| tbl-cap: "The data generating mechanism and true causal effects in each dataset. Sometimes, the unadjusted effect is the same, and sometimes it is not, depending on the mechanism and question." tibble::tribble( ~`Data generating mechanism`, ~`Correct causal model`, ~`Correct causal effect`, "(1) Collider", "outcome ~ exposure", "1", - "(2) Confounder", "outcome ~ exposure; covariate", "0.5", - "(3) Mediator", "Direct effect: outcome ~ exposure; covariate, Total Effect: outcome ~ exposure", "Direct effect: 0, Total effect: 1", + "(2) Confounder", "outcome ~ exposure; covariate", "0.5", + "(3) Mediator", "Direct effect: outcome ~ exposure; covariate, Total Effect: outcome ~ exposure", "Direct effect: 0, Total effect: 1", "(4) M-Bias", "outcome ~ exposure", "1" -) |> +) |> gt() ``` @@ -408,7 +408,7 @@ Only control for variables that precede the outcome. ```{r} #| label: fig-quartets-time-ordered #| fig-cap: "A time-ordered version of the collider DAG where each variable is measured twice. Controlling for `covariate` at follow-up is a collider, but controlling for `covariate` at baseline is not." -#| fig-subcap: +#| fig-subcap: #| - "In a time-ordered version of the collider DAG, controlling for the covariate at follow-up induces bias." #| - "Conversely, controlling for the covariate as measured at baseline does not induce bias because it is not a descendant of the outcome." #| layout-ncol: 2 @@ -416,12 +416,16 @@ Only control for variables that precede the outcome. #| fig-width: 4 #| fig-height: 3.75 coords <- list( - x = c(X_0 = 1, X_1 = 2, Z_1 = 2, Y_1 = 1.9, X_2 = 3, Y_2 = 2.9, Z_2 = 3, - X_3 = 4, Y_3 = 3.9, Z_3 = 4), - y = c(X_0 = 1, Y_0 = 1.05, - X_1 = 1, Z_1 = 1.1, Y_1 = 1.05, - X_2 = 1, Z_2 = 1.1, Y_2 = 1.05, - X_3 = 1, Z_3 = 1.1, Y_3 = 1.05) + x = c( + X_0 = 1, X_1 = 2, Z_1 = 2, Y_1 = 1.9, X_2 = 3, Y_2 = 2.9, Z_2 = 3, + X_3 = 4, Y_3 = 3.9, Z_3 = 4 + ), + y = c( + X_0 = 1, Y_0 = 1.05, + X_1 = 1, Z_1 = 1.1, Y_1 = 1.05, + X_2 = 1, Z_2 = 1.1, Y_2 = 1.05, + X_3 = 1, Z_3 = 1.1, Y_3 = 1.05 + ) ) d_coll <- dagify( Y_2 ~ X_1, @@ -453,14 +457,14 @@ d_coll |> geom_dag_edges(edge_color = "grey70") + geom_dag_text(aes(label = label)) + theme_dag() + - coord_cartesian(clip = "off") + - theme(legend.position = "bottom") + - geom_vline(xintercept = c(2.6, 3.25, 3.6, 4.25), lty = 2, color = "grey60") + - annotate("label", x = 2.925, y = 0.97, label = "baseline", color = "grey50") + - annotate("label", x = 3.925, y = 0.97, label = "follow-up", color = "grey50") + + coord_cartesian(clip = "off") + + theme(legend.position = "bottom") + + geom_vline(xintercept = c(2.6, 3.25, 3.6, 4.25), lty = 2, color = "grey60") + + annotate("label", x = 2.925, y = 0.97, label = "baseline", color = "grey50") + + annotate("label", x = 3.925, y = 0.97, label = "follow-up", color = "grey50") + guides(color = guide_legend( - title = NULL, - keywidth = unit(1.4, "mm"), + title = NULL, + keywidth = unit(1.4, "mm"), override.aes = list(size = 3.4, shape = 15) )) + scale_color_discrete(breaks = "covariate\n(follow-up)", na.value = "grey70") @@ -475,14 +479,14 @@ d_coll |> geom_dag_edges(edge_color = "grey70") + geom_dag_text(aes(label = label)) + theme_dag() + - coord_cartesian(clip = "off") + - theme(legend.position = "bottom") + - geom_vline(xintercept = c(2.6, 3.25, 3.6, 4.25), lty = 2, color = "grey60") + - annotate("label", x = 2.925, y = 0.97, label = "baseline", color = "grey50") + - annotate("label", x = 3.925, y = 0.97, label = "follow-up", color = "grey50") + + coord_cartesian(clip = "off") + + theme(legend.position = "bottom") + + geom_vline(xintercept = c(2.6, 3.25, 3.6, 4.25), lty = 2, color = "grey60") + + annotate("label", x = 2.925, y = 0.97, label = "baseline", color = "grey50") + + annotate("label", x = 3.925, y = 0.97, label = "follow-up", color = "grey50") + guides(color = guide_legend( - title = NULL, - keywidth = unit(1.4, "mm"), + title = NULL, + keywidth = unit(1.4, "mm"), override.aes = list(size = 3.4, shape = 15) )) + scale_color_discrete(breaks = "covariate\n(baseline)", na.value = "grey70") @@ -511,17 +515,18 @@ Even though `covariate_baseline` is only in the adjustment set for the second da causal_quartet_time |> nest_by(dataset) |> mutate( - adjusted_effect = + adjusted_effect = coef( - lm(outcome_followup ~ exposure_baseline + covariate_baseline, - data = data) + lm(outcome_followup ~ exposure_baseline + covariate_baseline, + data = data + ) )[2] ) |> bind_cols(tibble(truth = c(1, 0.5, 1, 1))) |> - select(-data, dataset) |> - ungroup() |> + select(-data, dataset) |> + ungroup() |> set_names(c("Dataset", "Adjusted effect", "Truth")) |> - gt() |> + gt() |> fmt_number(columns = -Dataset) ``` @@ -567,7 +572,7 @@ In the case of the collider data set, it's not even a helpful prediction tool be #| tbl-cap: "The difference in predictive metrics on `outcome` in each dataset with and without `covariate`. In each dataset, `covariate` adds information to the model, but this offers little guidance regarding the proper causal model." get_rmse <- function(data, model) { - sqrt(mean((data$outcome - predict(model, data)) ^ 2)) + sqrt(mean((data$outcome - predict(model, data))^2)) } get_r_squared <- function(model) { @@ -578,26 +583,26 @@ causal_quartet |> nest_by(dataset) |> mutate( rmse1 = get_rmse( - data, + data, lm(outcome ~ exposure, data = data) ), - rmse2 = + rmse2 = get_rmse( - data, + data, lm(outcome ~ exposure + covariate, data = data) ), rmse_diff = rmse2 - rmse1, r_squared1 = get_r_squared(lm(outcome ~ exposure, data = data)), r_squared2 = get_r_squared(lm(outcome ~ exposure + covariate, data = data)), r_squared_diff = r_squared2 - r_squared1 - ) |> + ) |> select(dataset, rmse = rmse_diff, r_squared = r_squared_diff) |> ungroup() |> gt() |> - fmt_number() |> + fmt_number() |> cols_label( - dataset = "Dataset", - rmse = "RMSE", + dataset = "Dataset", + rmse = "RMSE", r_squared = md("R^2^") ) ``` @@ -672,10 +677,10 @@ p_conf2 <- d_conf2 |> geom_dag_text(aes(label = label)) + theme_dag() + coord_cartesian(clip = "off") + - theme(legend.position = "none") + + theme(legend.position = "none") + guides(color = guide_legend( - title = NULL, - keywidth = unit(1.4, "mm"), + title = NULL, + keywidth = unit(1.4, "mm"), override.aes = list(size = 3.4, shape = 15) )) + scale_color_discrete(breaks = "confounder", na.value = "grey70") diff --git a/chapters/07-prep-data.qmd b/chapters/07-prep-data.qmd index fad20e1..24b34d5 100644 --- a/chapters/07-prep-data.qmd +++ b/chapters/07-prep-data.qmd @@ -402,7 +402,7 @@ To do so, we are going to use the `tbl_summary()` function from the `{gtsummary} library(gtsummary) library(labelled) seven_dwarfs_train_2018 <- seven_dwarfs_train_2018 |> - mutate(park_close = as.character(park_close)) |> + mutate(park_close = as.character(park_close)) |> set_variable_labels( park_ticket_season = "Ticket Season", park_close = "Close Time", diff --git a/chapters/15-g-comp.qmd b/chapters/15-g-comp.qmd index 64f6a14..103c365 100644 --- a/chapters/15-g-comp.qmd +++ b/chapters/15-g-comp.qmd @@ -29,10 +29,10 @@ n <- 1000000 tibble( roll_1 = sample(1:6, n, replace = TRUE), roll_2 = sample(1:6, n, replace = TRUE), -) |> - reframe(roll_1 + roll_2 == 2) |> - pull() |> - sum()/n +) |> + reframe(roll_1 + roll_2 == 2) |> + pull() |> + sum() / n ``` Monte Carlo simulations are extremely useful for estimating outcomes of complex processes for which closed mathematical solutions are not easy to determine. Indeed, that's why Monte Carlo simulations are so useful for the real-world causal mechanisms described in this book! @@ -123,7 +123,7 @@ seven_dwarfs_9 <- seven_dwarfs_train_2018 |> # A logistic regression for park_extra_magic_morning fit_extra_magic <- glm( - park_extra_magic_morning ~ + park_extra_magic_morning ~ park_ticket_season + park_close + park_temperature_high, data = seven_dwarfs_9, family = "binomial" @@ -131,7 +131,7 @@ fit_extra_magic <- glm( # A linear model for wait_minutes_posted_avg fit_wait_minutes <- lm( - wait_minutes_posted_avg ~ + wait_minutes_posted_avg ~ park_extra_magic_morning + park_ticket_season + park_close + park_temperature_high, data = seven_dwarfs_9 @@ -146,8 +146,8 @@ In the present case, we'll use sampling with replacement to generate a data fram # It's important to set seeds for reproducibility in Monte Carlo runs set.seed(8675309) -df_sim_baseline <- seven_dwarfs_9 |> - select(park_ticket_season, park_close, park_temperature_high) |> +df_sim_baseline <- seven_dwarfs_9 |> + select(park_ticket_season, park_close, park_temperature_high) |> sample_n(10000, replace = TRUE) ``` @@ -158,12 +158,12 @@ Other simulations (in this case, the only remaining variable, `wait_minutes_post ```{r} # Set the exposure groups for the causal contrast we wish to estimate -df_sim_time_1 <- df_sim_baseline |> +df_sim_time_1 <- df_sim_baseline |> mutate(park_extra_magic_morning = c(rep(1, 5000), rep(0, 5000))) # Simulate the outcome according to the parametric model in step 2 -df_outcome <- fit_wait_minutes |> - augment(newdata = df_sim_time_1) |> +df_outcome <- fit_wait_minutes |> + augment(newdata = df_sim_time_1) |> rename(wait_minutes_posted_avg = .fitted) ``` @@ -171,8 +171,8 @@ All that is left to do is compute the causal contrast we wish to estimate. Here, that contrast is the difference between expected wait minutes on extra magic mornings versus mornings without the extra magic program. ```{r} -df_outcome |> - group_by(park_extra_magic_morning) |> +df_outcome |> + group_by(park_extra_magic_morning) |> summarize(wait_minutes = mean(wait_minutes_posted_avg)) ``` @@ -256,37 +256,36 @@ One extension to our previous implementation is that we're going to embed the ea library(splines) fit_models <- function(.data) { - # A logistic regression for park_extra_magic_morning fit_extra_magic <- glm( - park_extra_magic_morning ~ + park_extra_magic_morning ~ park_ticket_season + park_close + park_temperature_high, data = .data, family = "binomial" ) - + # A linear model for wait_minutes_posted_avg fit_wait_minutes_posted <- lm( - wait_minutes_posted_avg ~ + wait_minutes_posted_avg ~ park_extra_magic_morning + park_ticket_season + park_close + park_temperature_high, data = .data ) - + # A linear model for wait_minutes_actual_avg # Let's go ahead an add a spline for further flexibility. # Be aware this is an area where you can add many options here # (interactions, etc) but you may get warnings and/or models which # fail to converge if you don't have enough data. fit_wait_minutes_actual <- lm( - wait_minutes_actual_avg ~ - ns(wait_minutes_posted_avg, df = 3) + + wait_minutes_actual_avg ~ + ns(wait_minutes_posted_avg, df = 3) + park_extra_magic_morning + park_ticket_season + park_close + park_temperature_high, data = .data ) - + # return a list that we can pipe into our simulation step (up next) return( list( @@ -296,7 +295,6 @@ fit_models <- function(.data) { fit_wait_minutes_actual = fit_wait_minutes_actual ) ) - } ``` @@ -308,40 +306,38 @@ Next, we write a function which will complete step 3: from a random sample from # contrast gives exposure (default 60) and control group (default 30) settings # n_sample is the size of the baseline resample of .data simulate_process <- function( - fit_obj, + fit_obj, contrast = c(60, 30), - n_sample = 10000 -) { - + n_sample = 10000) { # Draw a random sample of baseline variables - df_baseline <- fit_obj |> - pluck(".data") |> - select(park_ticket_season, park_close, park_temperature_high) |> + df_baseline <- fit_obj |> + pluck(".data") |> + select(park_ticket_season, park_close, park_temperature_high) |> sample_n(n_sample, replace = TRUE) - + # Simulate park_extra_magic_morning - df_sim_time_1 <- fit_obj |> - pluck("fit_extra_magic") |> - augment(newdata = df_baseline, type.predict = "response") |> + df_sim_time_1 <- fit_obj |> + pluck("fit_extra_magic") |> + augment(newdata = df_baseline, type.predict = "response") |> # .fitted is the probability that park_extra_magic_morning is 1, # so let's use that to generate a 0/1 outcome mutate( park_extra_magic_morning = rbinom(n(), 1, .fitted) ) - + # Assign wait_minutes_posted_avg, since it's the intervention - df_sim_time_2 <- df_sim_time_1 |> + df_sim_time_2 <- df_sim_time_1 |> mutate( - wait_minutes_posted_avg = - c(rep(contrast[1], n_sample/2), rep(contrast[2], n_sample/2)) + wait_minutes_posted_avg = + c(rep(contrast[1], n_sample / 2), rep(contrast[2], n_sample / 2)) ) - - # Simulate the outcome - df_outcome <- fit_obj |> - pluck("fit_wait_minutes_actual") |> - augment(newdata = df_sim_time_2) |> + + # Simulate the outcome + df_outcome <- fit_obj |> + pluck("fit_wait_minutes_actual") |> + augment(newdata = df_sim_time_2) |> rename(wait_minutes_actual_avg = .fitted) - + # return a list that we can pipe into the contrast estimation step (up next) return( list( @@ -357,26 +353,24 @@ Finally, in step 4, we compute the summary statistics and causal contrast of int ```{r} # sim_obj is a list created by our simulate_process() function compute_stats <- function(sim_obj) { - - exposure_val <- sim_obj |> + exposure_val <- sim_obj |> pluck("contrast", 1) - - control_val <- sim_obj |> + + control_val <- sim_obj |> pluck("contrast", 2) - - sim_obj |> - pluck("df_outcome") |> - group_by(wait_minutes_posted_avg) |> - summarize(avg_wait_actual = mean(wait_minutes_actual_avg)) |> + + sim_obj |> + pluck("df_outcome") |> + group_by(wait_minutes_posted_avg) |> + summarize(avg_wait_actual = mean(wait_minutes_actual_avg)) |> pivot_wider( - names_from = wait_minutes_posted_avg, + names_from = wait_minutes_posted_avg, values_from = avg_wait_actual, names_prefix = "x_" - ) |> + ) |> summarize( x_60, x_30, x_60 - x_30 ) - } ``` @@ -398,31 +392,31 @@ wait_times <- eight |> drop_na(wait_minutes_actual_avg) # get a single point estimate to make sure things work as we planned -wait_times |> - fit_models() |> - simulate_process() |> - compute_stats() |> +wait_times |> + fit_models() |> + simulate_process() |> + compute_stats() |> # rsample wants results labelled this way pivot_longer( - names_to = "term", - values_to = "estimate", + names_to = "term", + values_to = "estimate", cols = everything() ) # compute bootstrap confidence intervals library(rsample) -boots <- bootstraps(wait_times, times = 1000, apparent = TRUE) |> +boots <- bootstraps(wait_times, times = 1000, apparent = TRUE) |> mutate( models = map( - splits, - \(.x) analysis(.x) |> - fit_models() |> - simulate_process() |> - compute_stats() |> + splits, + \(.x) analysis(.x) |> + fit_models() |> + simulate_process() |> + compute_stats() |> pivot_longer( - names_to = "term", - values_to = "estimate", + names_to = "term", + values_to = "estimate", cols = everything() ) ) diff --git a/chapters/17-missingness-and-measurement.qmd b/chapters/17-missingness-and-measurement.qmd index 57365ac..a3c0ac0 100644 --- a/chapters/17-missingness-and-measurement.qmd +++ b/chapters/17-missingness-and-measurement.qmd @@ -57,23 +57,25 @@ glyph <- function(data, params, size) { show_edge_color <- function(...) { list( - theme(legend.position = "bottom"), - ggokabeito::scale_edge_color_okabe_ito(name = NULL, breaks = ~ .x[!is.na(.x)]), + theme(legend.position = "bottom"), + ggokabeito::scale_edge_color_okabe_ito(name = NULL, breaks = ~ .x[!is.na(.x)]), guides(color = "none") ) } edges_with_aes <- function(..., edge_color = "grey85", shadow = TRUE) { list( - if (shadow) geom_dag_edges_link( + if (shadow) { + geom_dag_edges_link( data = \(.x) filter(.x, is.na(path)), edge_color = edge_color - ), - geom_dag_edges_link( - aes(edge_color = path), - data = \(.x) mutate(.x, path = ifelse(is.na(to), NA, path)) ) + }, + geom_dag_edges_link( + aes(edge_color = path), + data = \(.x) mutate(.x, path = ifelse(is.na(to), NA, path)) ) + ) } ggdag2 <- function(.dag, ..., order = 1:9, seed = 1633, box.padding = 3.4, edges = geom_dag_edges_link(edge_color = "grey85")) { @@ -81,13 +83,13 @@ ggdag2 <- function(.dag, ..., order = 1:9, seed = 1633, box.padding = 3.4, edges .dag, aes_dag(...) ) + - edges + - geom_dag_point(key_glyph = glyph) + - geom_dag_text_repel(aes(label = label), size = 3.8, seed = seed, color = "#494949", box.padding = box.padding) + - ggokabeito::scale_color_okabe_ito(order = order, na.value = "grey90", breaks = ~ .x[!is.na(.x)]) + - theme_dag() + - theme(legend.position = "none") + - coord_cartesian(clip = "off") + edges + + geom_dag_point(key_glyph = glyph) + + geom_dag_text_repel(aes(label = label), size = 3.8, seed = seed, color = "#494949", box.padding = box.padding) + + ggokabeito::scale_color_okabe_ito(order = order, na.value = "grey90", breaks = ~ .x[!is.na(.x)]) + + theme_dag() + + theme(legend.position = "none") + + coord_cartesian(clip = "off") } add_measured <- function(.df) { @@ -97,7 +99,8 @@ add_measured <- function(.df) { str_detect(label, "measured") ~ "measured", str_detect(label, "wait") ~ "truth", .default = NA - )) + ) + ) } add_missing <- function(.df) { @@ -107,7 +110,8 @@ add_missing <- function(.df) { str_detect(label, "missing") ~ "missingness indicator", str_detect(label, "wait") ~ "wait times", .default = NA - )) + ) + ) } labels <- c( @@ -124,14 +128,14 @@ dagify( posted_star ~ u_posted + posted, actual_star ~ u_actual + actual, coords = time_ordered_coords(), - labels = labels, + labels = labels, exposure = "posted_star", outcome = "actual_star" -) |> - tidy_dagitty() |> - add_measured() |> +) |> + tidy_dagitty() |> + add_measured() |> ggdag2(color = measured, order = 6:7) + - theme(legend.position = "bottom") + + theme(legend.position = "bottom") + labs(color = NULL) + theme( legend.key.spacing.x = unit(4, "points"), @@ -160,7 +164,7 @@ The extent of this error depends on how well the measured version correlates wit ```{r} #| label: fig-meas-err-other -#| fig-subcap: +#| fig-subcap: #| - "An updated DAG where the measured version of posted wait times is only caused by the true value, meaning it is perfectly measured. For the actual wait times, the three causes are the true values, whether or not TouringPlans employed the reporter, and an unknown mismeasurement mechanism." #| - "A null DAG, meaning we have removed the arrow from actual wait times to posted wait times. When we look at a null DAG, we can see the mechanisms of mismeasurement are separate for the two wait time variables." #| echo: false @@ -182,9 +186,9 @@ dagify( actual_star ~ u_actual + actual + employed, coords = time_ordered_coords(), labels = labels -) |> - tidy_dagitty() |> - add_measured() |> +) |> + tidy_dagitty() |> + add_measured() |> ggdag2(color = measured, order = 6:7, box.padding = 3.7, seed = 123) dagitty::dagitty( @@ -200,9 +204,9 @@ employed -> actual_star posted -> posted_star u_actual -> actual_star }' -) |> - dag_label(labels = labels) |> - add_measured() |> +) |> + dag_label(labels = labels) |> + add_measured() |> ggdag2(color = measured, order = 6:7, box.padding = 2.5) ``` @@ -248,7 +252,7 @@ y <- case_when( ) x_measured <- ifelse( - x %in% c("c", "d"), + x %in% c("c", "d"), sample(c("c", "d"), size = n, replace = TRUE), x ) @@ -266,7 +270,7 @@ This is called *dependent, non-differential* measurement error. ```{r} #| label: fig-meas-err-dag-dep -#| fig-subcap: +#| fig-subcap: #| - "Now, the DAG includes an arrow from `unknown` to both measured variables, meaning that the way they are mismeasured is not independent. In other words, `unknown` is now a mutual cause of the mismeasured variables---a confounder." #| - "The open paths in this DAG. Because this is a null DAG, the only open pathway is a biasing one from the two mismeasured variables via `unknown`." #| echo: false @@ -274,12 +278,12 @@ This is called *dependent, non-differential* measurement error. #| fig-width: 4 #| fig-height: 4 labels <- c( - "actual" = "actual wait", - "actual_star" = "measured\nactual", - "posted" = "posted wait", - "posted_star" = "measured\nposted", - "employed" = "employed by TP", - "u_actual" = "unknown" + "actual" = "actual wait", + "actual_star" = "measured\nactual", + "posted" = "posted wait", + "posted_star" = "measured\nposted", + "employed" = "employed by TP", + "u_actual" = "unknown" ) depend_dag <- dagify( @@ -287,22 +291,22 @@ depend_dag <- dagify( actual_star ~ u_actual + actual + employed, coords = time_ordered_coords(), labels = labels, - exposure = "posted_star", + exposure = "posted_star", outcome = "actual_star" ) -depend_dag |> - tidy_dagitty() |> - add_measured() |> +depend_dag |> + tidy_dagitty() |> + add_measured() |> ggdag2(color = measured, order = 6:7, box.padding = 3) -depend_dag |> - tidy_dagitty() |> - dag_paths() |> +depend_dag |> + tidy_dagitty() |> + dag_paths() |> ggdag2( - color = path, - edge_color = path, - box.padding = 2.35, + color = path, + edge_color = path, + box.padding = 2.35, edges = edges_with_aes(shadow = FALSE) ) + show_edge_color() @@ -318,12 +322,12 @@ Let's expand @fig-meas-err-dag-dep-1 to include an arrow from posted time to how #| echo: false #| fig-width: 7.5 labels <- c( - "actual" = "actual wait", - "actual_star" = "measured\nactual", - "posted" = "posted wait", - "posted_star" = "measured\nposted", - "employed" = "employed by TP", - "u_actual" = "unknown" + "actual" = "actual wait", + "actual_star" = "measured\nactual", + "posted" = "posted wait", + "posted_star" = "measured\nposted", + "employed" = "employed by TP", + "u_actual" = "unknown" ) depend_dag <- dagify( @@ -335,17 +339,17 @@ depend_dag <- dagify( outcome = "actual_star" ) -depend_dag |> - tidy_dagitty() |> - dag_paths() |> +depend_dag |> + tidy_dagitty() |> + dag_paths() |> ggdag2( - color = path, - edge_color = path, + color = path, + edge_color = path, seed = 234, edges = edges_with_aes(edge_color = "grey90") ) + show_edge_color() + - facet_wrap(~ set) + + facet_wrap(~set) + theme(strip.text = element_blank()) ``` @@ -383,8 +387,8 @@ true_model <- lm(outcome ~ exposure * confounder) # mismeasure confounder confounder <- ifelse( - outcome > 0, - confounder, + outcome > 0, + confounder, confounder + 10 * rnorm(n) ) @@ -398,21 +402,21 @@ mismeasured_model <- lm(outcome ~ exposure * confounder) library(gt) library(broom) pull_interaction <- function(mdl) { - mdl |> + mdl |> tidy() |> - filter(term == "exposure:confounder") |> + filter(term == "exposure:confounder") |> mutate( estimate = round(estimate, 3), p.value = scales::label_pvalue()(p.value) - ) |> + ) |> select(term, estimate, `p-value` = p.value) } map( list("true" = true_model, "mismeasured" = mismeasured_model), pull_interaction -) |> - list_rbind(names_to = "model") |> +) |> + list_rbind(names_to = "model") |> gt() ``` @@ -454,11 +458,11 @@ missing_dag <- dagify( outcome = "actual" ) -missing_dag |> - tidy_dagitty() |> - add_missing() |> +missing_dag |> + tidy_dagitty() |> + add_missing() |> ggdag2(color = missing, order = c(5, 3), box.padding = 3) + - theme(legend.position = "bottom") + + theme(legend.position = "bottom") + labs(color = NULL) + theme( legend.key.spacing.x = unit(4, "points"), @@ -478,14 +482,14 @@ However, in this simple DAG, conditioning on missingness does not open a backdoo #| echo: false #| fig-width: 4 #| fig-height: 4 -missing_dag |> - tidy_dagitty() |> - add_missing() |> - dag_paths(adjust_for = "actual_missing") |> +missing_dag |> + tidy_dagitty() |> + add_missing() |> + dag_paths(adjust_for = "actual_missing") |> ggdag2( - color = path, + color = path, edge_color = path, - box.padding = 3, + box.padding = 3, seed = 234, edges = edges_with_aes(edge_color = "grey90") ) + @@ -509,7 +513,7 @@ In this case, there is no way to close the backdoor paths opened by conditioning ```{r} #| label: fig-missing-dag-actual -#| fig-subcap: +#| fig-subcap: #| - "Now, the DAG includes an arrow from actual wait times to the missingness indicator for actual wait times. In other words, the values of actual wait times themselves influence whether or not we've managed to measure them." #| - "Conditioning on the missingness indicator now opens a backdoor path between the exposure and the outcome, and we have no means to close it." #| echo: false @@ -532,23 +536,23 @@ missing_dag <- dagify( outcome = "actual" ) -missing_dag |> - tidy_dagitty() |> - add_missing() |> +missing_dag |> + tidy_dagitty() |> + add_missing() |> ggdag2(color = missing, order = c(5, 3)) -missing_dag |> - tidy_dagitty() |> - add_missing() |> - dag_paths(adjust_for = "actual_missing") |> +missing_dag |> + tidy_dagitty() |> + add_missing() |> + dag_paths(adjust_for = "actual_missing") |> ggdag2( - color = path, - edge_color = path, + color = path, + edge_color = path, seed = 234, edges = edges_with_aes(edge_color = "grey90") ) + show_edge_color() + - facet_wrap(~ set) + + facet_wrap(~set) + expand_plot(expand_x = expansion(c(.2, .2))) + theme(strip.text = element_blank()) ``` @@ -572,46 +576,46 @@ define_dag <- function(..., tag, title) { coords = time_ordered_coords(), exposure = "p", outcome = "a" - ) |> - ggdag(size = .7) + - labs(title = paste0(tag, ": ", title)) + + ) |> + ggdag(size = .7) + + labs(title = paste0(tag, ": ", title)) + theme_dag() + - theme(plot.title = element_text(size = 12)) + + theme(plot.title = element_text(size = 12)) + expand_plot(expansion(c(0.4, 0.4)), expansion(c(0.4, 0.4))) } dag_1 <- define_dag( a ~ p, m ~ u, - tag = "1", + tag = "1", title = "`actual` is missing" ) dag_2 <- define_dag( a ~ p, m ~ u + p, - tag = "2", + tag = "2", title = "`actual` is missing" ) dag_3 <- define_dag( a ~ p, m ~ u + a, - tag = "3", + tag = "3", title = "`actual` is missing" ) dag_4 <- define_dag( a ~ p, m ~ u + p, - tag = "4", + tag = "4", title = "`posted` is missing" ) dag_5 <- define_dag( a ~ p, m ~ u + a, - tag = "5", + tag = "5", title = "`posted` is missing" ) @@ -629,10 +633,10 @@ In DAG 4, we can calculate the mean of `actual` and the causal effect but not th #| label: fig-recoverables #| fig-cap: "A forest plot of the results of three different effects for data simulated from each DAG in @fig-missing-dags-sim. In the non-missing results, we can see what the effect should be for the sample. Each simulated dataset has 365 rows with missingness in either actual or posted wait times. For each of the DAGs, we're limited in what we can estimate correctly." #| echo: false -set.seed(123) +set.seed(123) posted <- rnorm(365, mean = 30, sd = 5) # create an effect where an hour of posted time creates 50 min of actual time -coef <- 50 / 60 +coef <- 50 / 60 actual <- coef * posted + rnorm(365, mean = 0, sd = 2) posted_60 <- posted / 60 @@ -650,16 +654,16 @@ fit_stats <- function(dag, actual, posted_60, missing_by = NULL, missing_for = " if (!is.null(missing_by) & missing_for == "actual") { actual[missing_by] <- NA } - + if (!is.null(missing_by) & missing_for == "posted") { posted_60[missing_by] <- NA } - + t_actual <- t.test(actual) t_posted <- t.test(posted_60 * 60) mdl <- lm(actual ~ posted_60) mdl_confints <- confint(mdl) - + tibble( dag = dag, mean_actual_estimate = as.numeric(t_actual$estimate), @@ -671,7 +675,7 @@ fit_stats <- function(dag, actual, posted_60, missing_by = NULL, missing_for = " coef_60_estimate = coefficients(mdl)[["posted_60"]], coef_60_lower = mdl_confints[2, 1], coef_60_upper = mdl_confints[2, 2] - ) |> + ) |> pivot_longer( cols = -dag, names_to = c("stat", ".value"), @@ -688,20 +692,20 @@ dag_stats <- bind_rows( fit_stats("DAG 5", actual, posted_60, missing_by = missing_dag_5, missing_for = "posted"), ) -dag_stats |> +dag_stats |> mutate( true_value = ifelse(dag == "No missingness", "True value", "Observed value"), dag = factor(dag, levels = c(paste("DAG", 5:1), "No missingness")), stat = factor( - stat, - levels = c("mean_posted", "mean_actual", "coef_60"), + stat, + levels = c("mean_posted", "mean_actual", "coef_60"), labels = c("Mean of Posted", "Mean of Actual", "Causal effect") ) - ) |> - ggplot(aes(color = true_value)) + - geom_point(aes(estimate, dag)) + - geom_segment(aes(x = lower, xend = upper, y = dag, yend = dag, group = stat)) + - facet_wrap(~ stat, scales = "free_x") + + ) |> + ggplot(aes(color = true_value)) + + geom_point(aes(estimate, dag)) + + geom_segment(aes(x = lower, xend = upper, y = dag, yend = dag, group = stat)) + + facet_wrap(~stat, scales = "free_x") + labs(y = NULL, color = NULL) ``` @@ -759,19 +763,19 @@ calib_model <- lm( ns(wait_minutes_posted_avg, df = 4) * wait_hour + park_temperature_high + park_close + park_ticket_season, # use log of `wait_minutes_actual_avg` - data = seven_dwarfs_train_2018 |> + data = seven_dwarfs_train_2018 |> mutate(wait_minutes_actual_avg = log1p(wait_minutes_actual_avg)) ) -seven_dwarves_calib <- calib_model |> - augment(newdata = seven_dwarfs_train_2018) |> - rename(wait_minutes_actual_calib = .fitted) |> - # convert back to the original scale +seven_dwarves_calib <- calib_model |> + augment(newdata = seven_dwarfs_train_2018) |> + rename(wait_minutes_actual_calib = .fitted) |> + # convert back to the original scale # and fill in real values where they exist mutate( wait_minutes_actual_calib = exp(wait_minutes_actual_calib) - 1, wait_minutes_actual_calib = coalesce( - wait_minutes_actual_avg, + wait_minutes_actual_avg, wait_minutes_actual_calib ) ) @@ -803,11 +807,11 @@ fit_ipw_effect <- function(.fmla, .data = seven_dwarfs, .trt = "park_extra_magic } effect_calib <- fit_ipw_effect( - park_extra_magic_morning ~ park_temperature_high + + park_extra_magic_morning ~ park_temperature_high + park_close + park_ticket_season, .outcome_fmla = wait_minutes_actual_calib ~ park_extra_magic_morning, .data = seven_dwarves_calib |> filter(wait_hour == 9) -) |> +) |> round(2) ```