cmu-delphi
diff --git a/‎DEVELOPMENT.md
Lines changed: 4 additions & 2 deletions b/‎DEVELOPMENT.md
Lines changed: 4 additions & 2 deletions
diff --git a/‎R/arx_classifier.R
Lines changed: 2 additions & 4 deletions b/‎R/arx_classifier.R
Lines changed: 2 additions & 4 deletions
diff --git a/‎R/arx_forecaster.R
Lines changed: 24 additions & 18 deletions b/‎R/arx_forecaster.R
Lines changed: 24 additions & 18 deletions
diff --git a/‎R/epi_recipe.R
Lines changed: 7 additions & 5 deletions b/‎R/epi_recipe.R
Lines changed: 7 additions & 5 deletions
diff --git a/‎R/epi_workflow.R
Lines changed: 45 additions & 35 deletions b/‎R/epi_workflow.R
Lines changed: 45 additions & 35 deletions
diff --git a/‎R/extrapolate_quantiles.R
Lines changed: 14 additions & 2 deletions b/‎R/extrapolate_quantiles.R
Lines changed: 14 additions & 2 deletions
diff --git a/‎R/flatline_forecaster.R
Lines changed: 32 additions & 9 deletions b/‎R/flatline_forecaster.R
Lines changed: 32 additions & 9 deletions
diff --git a/‎R/frosting.R
Lines changed: 2 additions & 1 deletion b/‎R/frosting.R
Lines changed: 2 additions & 1 deletion
diff --git a/‎R/get_test_data.R
Lines changed: 4 additions & 5 deletions b/‎R/get_test_data.R
Lines changed: 4 additions & 5 deletions
@@ -35,12 +35,14 @@ R -e 'devtools::document()'
 R -e 'pkgdown::build_site()'
 ```
 
-Note that sometimes the caches from either `pkgdown` or `knitr` can cause difficulties. To clear those, run `make`, with either `clean_knitr`, `clean_site`, or `clean` (which does both).
+Note that sometimes the caches from either `pkgdown` or `knitr` can cause
+difficulties. To clear those, run `make`, with either `clean_knitr`,
+`clean_site`, or `clean` (which does both).
 
 If you work without R Studio and want to iterate on documentation, you might
 find [this
 script](https://gist.github.com/gadenbuie/d22e149e65591b91419e41ea5b2e0621)
-helpful.
+helpful. For updating references, you will need to manually call `pkgdown::build_reference()`.
 
 ## Versioning
 
 
@@ -166,9 +166,7 @@ arx_classifier <- function(
   }
   forecast_date <- args_list$forecast_date %||% forecast_date_default
   target_date <- args_list$target_date %||% (forecast_date + args_list$ahead)
-  preds <- forecast(
-    wf,
-  ) %>%
+  preds <- forecast(wf) %>%
     as_tibble() %>%
     select(-time_value)
 
@@ -347,7 +345,7 @@ arx_class_epi_workflow <- function(
 #'   be created using growth rates (as the predictors are) or lagged
 #'   differences. The second case is closer to the requirements for the
 #'   [2022-23 CDC Flusight Hospitalization Experimental Target](https://github.com/cdcepi/Flusight-forecast-data/blob/745511c436923e1dc201dea0f4181f21a8217b52/data-experimental/README.md).
-#'   See the Classification Vignette for details of how to create a reasonable
+#'   See the [Classification chapter from the forecasting book](https://cmu-delphi.github.io/delphi-tooling-book/arx-classifier.html) for details of how to create a reasonable
 #'   baseline for this case. Selecting `"growth_rate"` (the default) uses
 #'   [epiprocess::growth_rate()] to create the outcome using some of the
 #'   additional arguments below. Choosing `"lag_difference"` instead simply
 
@@ -1,26 +1,29 @@
 #' Direct autoregressive forecaster with covariates
 #'
 #' This is an autoregressive forecasting model for
-#' [epiprocess::epi_df][epiprocess::as_epi_df] data. It does "direct" forecasting, meaning
-#' that it estimates a model for a particular target horizon.
+#' [epiprocess::epi_df][epiprocess::as_epi_df] data. It does "direct"
+#' forecasting, meaning that it estimates a model for a particular target
+#' horizon of `outcome` based on the lags of the `predictors`. See the [Get
+#' started vignette](../articles/epipredict.html) for some worked examples and
+#' [Custom epi_workflows vignette](../articles/custom_epiworkflows.html) for a
+#' recreation using a custom `epi_workflow()`.
 #'
 #'
 #' @param epi_data An `epi_df` object
-#' @param outcome A character (scalar) specifying the outcome (in the
-#'   `epi_df`).
+#' @param outcome A character (scalar) specifying the outcome (in the `epi_df`).
 #' @param predictors A character vector giving column(s) of predictor variables.
-#'   This defaults to the `outcome`. However, if manually specified, only those variables
-#'   specifically mentioned will be used. (The `outcome` will not be added.)
-#'   By default, equals the outcome. If manually specified, does not add the
-#'   outcome variable, so make sure to specify it.
-#' @param trainer A `{parsnip}` model describing the type of estimation.
-#'   For now, we enforce `mode = "regression"`.
-#' @param args_list A list of customization arguments to determine
-#'   the type of forecasting model. See [arx_args_list()].
+#'   This defaults to the `outcome`. However, if manually specified, only those
+#'   variables specifically mentioned will be used: the `outcome` will not be
+#'   added automatically, so make sure to include it if it should also serve
+#'   as a predictor.
+#' @param trainer A `{parsnip}` model describing the type of estimation.  For
+#'   now, we enforce `mode = "regression"`.
+#' @param args_list A list of customization arguments to determine the type of
+#'   forecasting model. See [arx_args_list()].
 #'
-#' @return A list with (1) `predictions` an `epi_df` of predicted values
-#'   and (2) `epi_workflow`, a list that encapsulates the entire estimation
-#'   workflow
+#' @return An `arx_fcast`, with the fields `predictions` and `epi_workflow`.
+#'   `predictions` is an `epi_df` of predicted values while `epi_workflow` is
+#'   the fit workflow used to make those predictions.
 #' @export
 #' @seealso [arx_fcast_epi_workflow()], [arx_args_list()]
 #'
@@ -29,15 +32,18 @@
 #'   dplyr::filter(time_value >= as.Date("2021-12-01"))
 #'
 #' out <- arx_forecaster(
-#'   jhu, "death_rate",
+#'   jhu,
+#'   "death_rate",
 #'   c("case_rate", "death_rate")
 #' )
 #'
-#' out <- arx_forecaster(jhu, "death_rate",
+#' out <- arx_forecaster(jhu,
+#'   "death_rate",
 #'   c("case_rate", "death_rate"),
 #'   trainer = quantile_reg(),
 #'   args_list = arx_args_list(quantile_levels = 1:9 / 10)
 #' )
+#' out
 arx_forecaster <- function(
     epi_data,
     outcome,
@@ -60,7 +66,7 @@ arx_forecaster <- function(
   forecast_date <- args_list$forecast_date %||% forecast_date_default
 
 
-  preds <- forecast(wf, forecast_date = forecast_date) %>%
+  preds <- forecast(wf) %>%
     as_tibble() %>%
     select(-time_value)
 
 
@@ -232,9 +232,10 @@ is_epi_recipe <- function(x) {
 
 
 
-#' Add an `epi_recipe` to a workflow
+#' Given an `epi_recipe`, add it to, remove it from, or update it in an
+#' `epi_workflow`
 #'
-#' @seealso [workflows::add_recipe()]
+#' @description
 #' - `add_recipe()` specifies the terms of the model and any preprocessing that
 #'   is required through the usage of a recipe.
 #'
@@ -244,9 +245,9 @@ is_epi_recipe <- function(x) {
 #' recipe with the new one.
 #'
 #' @details
-#' `add_epi_recipe` has the same behaviour as
-#' [workflows::add_recipe()] but sets a different
-#' default blueprint to automatically handle [epiprocess::epi_df][epiprocess::as_epi_df] data.
+#' `add_epi_recipe()` has the same behaviour as [workflows::add_recipe()] but
+#'   sets a different default blueprint to automatically handle
+#'   [epiprocess::epi_df][epiprocess::as_epi_df] data.
 #'
 #' @param x A `workflow` or `epi_workflow`
 #'
@@ -265,6 +266,7 @@ is_epi_recipe <- function(x) {
 #' `x`, updated with a new recipe preprocessor.
 #'
 #' @export
+#' @seealso [workflows::add_recipe()]
 #' @examples
 #' jhu <- covid_case_death_rates %>%
 #'   filter(time_value > "2021-08-01")
 
@@ -1,19 +1,20 @@
 #' Create an epi_workflow
 #'
 #' This is a container object that unifies preprocessing, fitting, prediction,
-#' and post-processing for predictive modeling on epidemiological data. It extends
-#' the functionality of a [workflows::workflow()] to handle the typical panel
-#' data structures found in this field. This extension is handled completely
-#' internally, and should be invisible to the user. For all intents and purposes,
-#' this operates exactly like a [workflows::workflow()]. For more details
-#' and numerous examples, see there.
+#' and post-processing for predictive modeling on epidemiological data. It
+#' extends the functionality of a [workflows::workflow()] to handle the typical
+#' panel data structures found in this field. This extension is handled
+#' completely internally, and should be invisible to the user. For all intents
+#' and purposes, this operates exactly like a [workflows::workflow()]. For some
+#' `{epipredict}` specific examples, see the [custom epiworkflows
+#' vignette](../articles/custom_epiworkflows.html).
 #'
 #' @inheritParams workflows::workflow
 #' @param postprocessor An optional postprocessor to add to the workflow.
 #'   Currently only `frosting` is allowed using, `add_frosting()`.
 #'
 #' @return A new `epi_workflow` object.
-#' @seealso workflows::workflow
+#' @seealso [workflows::workflow()]
 #' @importFrom rlang is_null
 #' @importFrom stats predict
 #' @importFrom generics fit
@@ -62,9 +63,9 @@ is_epi_workflow <- function(x) {
 #' Fit an `epi_workflow` object
 #'
 #' @description
-#' This is the `fit()` method for an `epi_workflow` object that
+#' This is the `fit()` method for an `epi_workflow()` object that
 #' estimates parameters for a given model from a set of data.
-#' Fitting an `epi_workflow` involves two main steps, which are
+#' Fitting an `epi_workflow()` involves two main steps, which are
 #' preprocessing the data and fitting the underlying parsnip model.
 #'
 #' @inheritParams workflows::fit.workflow
@@ -79,7 +80,7 @@ is_epi_workflow <- function(x) {
 #' @return The `epi_workflow` object, updated with a fit parsnip
 #' model in the `object$fit$fit` slot.
 #'
-#' @seealso workflows::fit-workflow
+#' @seealso [workflows::fit.workflow()]
 #'
 #' @name fit-epi_workflow
 #' @export
@@ -111,20 +112,20 @@ fit.epi_workflow <- function(object, data, ..., control = workflows::control_wor
 #' Predict from an epi_workflow
 #'
 #' @description
-#' This is the `predict()` method for a fit epi_workflow object. The nice thing
-#' about predicting from an epi_workflow is that it will:
+#' This is the `predict()` method for a fit epi_workflow object. The 3 steps that this implements are:
 #'
-#' - Preprocess `new_data` using the preprocessing method specified when the
-#'   workflow was created and fit. This is accomplished using
-#'   [hardhat::forge()], which will apply any formula preprocessing or call
-#'   [recipes::bake()] if a recipe was supplied.
+#' - Preprocessing `new_data` using the preprocessing method specified when the
+#'   epi_workflow was created and fit. This is accomplished using
+#'   `recipes::bake()` if a recipe was supplied. Note that this is a slightly
+#'   different `bake` operation than the one occurring during the fit. Any `step`
+#'   that has `skip = TRUE` isn't applied during prediction; for example in
+#'   `step_epi_naomit()`, `all_outcomes()` isn't `NA` omitted, since doing so
+#'   would drop the exact `time_values` we are trying to predict.
 #'
-#' - Call [parsnip::predict.model_fit()] for you using the underlying fit
+#' - Calling `parsnip::predict.model_fit()` for you using the underlying fit
 #'   parsnip model.
 #'
-#' - Ensure that the returned object is an [epiprocess::epi_df][epiprocess::as_epi_df] where
-#'   possible. Specifically, the output will have `time_value` and
-#'   `geo_value` columns as well as the prediction.
+#' - Running `slather()` on any frosting included in the `epi_workflow`.
 #'
 #' @param object An epi_workflow that has been fit by
 #'   [workflows::fit.workflow()]
@@ -136,7 +137,7 @@ fit.epi_workflow <- function(object, data, ..., control = workflows::control_wor
 #'
 #' @return
 #' A data frame of model predictions, with as many rows as `new_data` has.
-#' If `new_data` is an `epi_df` or a data frame with `time_value` or
+#' If `new_data` is an `epi_df()` or a data frame with `time_value` or
 #' `geo_value` columns, then the result will have those as well.
 #'
 #' @name predict-epi_workflow
@@ -177,6 +178,11 @@ predict.epi_workflow <- function(object, new_data, type = NULL, opts = list(), .
 
 #' Augment data with predictions
 #'
+#' `augment()`, unlike `forecast()`, has the goal of modifying the training
+#' data, rather than just producing new forecasts. It does a prediction on
+#' `new_data`, which will produce a prediction for most `time_values`, and then
+#' adds `.pred` as a column to `new_data` and returns the resulting join.
+#'
 #' @param x A trained epi_workflow
 #' @param new_data A epi_df of predictors
 #' @param ... Arguments passed on to the predict method.
@@ -228,26 +234,30 @@ print.epi_workflow <- function(x, ...) {
 }
 
 
-#' Produce a forecast from an epi workflow
+#' Produce a forecast from just an epi workflow
+#'
+#' `forecast.epi_workflow` predicts by restricting the training data to the
+#' latest available data, and predicting on that. It binds together
+#' `get_test_data()` and `predict()`.
 #'
 #' @param object An epi workflow.
-#' @param ... Not used.
-#' @param n_recent Integer or NULL. If filling missing data with locf = TRUE,
-#' how far back are we willing to tolerate missing data? Larger values allow
-#' more filling. The default NULL will determine this from the the recipe. For
-#' example, suppose n_recent = 3, then if the 3 most recent observations in any
-#' geo_value are all NA’s, we won’t be able to fill anything, and an error
-#' message will be thrown. (See details.)
-#' @param forecast_date By default, this is set to the maximum time_value in x.
-#' But if there is data latency such that recent NA's should be filled, this may
-#' be after the last available time_value.
 #'
 #' @return A forecast tibble.
 #'
 #' @export
-forecast.epi_workflow <- function(object, ..., n_recent = NULL, forecast_date = NULL) {
-  rlang::check_dots_empty()
-
+#' @examples
+#' jhu <- covid_case_death_rates %>%
+#'   filter(time_value > "2021-08-01")
+#'
+#' r <- epi_recipe(jhu) %>%
+#'   step_epi_lag(death_rate, lag = c(0, 7, 14)) %>%
+#'   step_epi_ahead(death_rate, ahead = 7) %>%
+#'   step_epi_naomit()
+#'
+#' epi_workflow(r, parsnip::linear_reg()) %>%
+#'   fit(jhu) %>%
+#'   forecast()
+forecast.epi_workflow <- function(object) {
   if (!object$trained) {
     cli_abort(c(
       "You cannot `forecast()` a {.cls workflow} that has not been trained.",
 
@@ -1,4 +1,16 @@
-#' Summarize a distribution with a set of quantiles
+#' Extrapolate the quantiles to new quantile levels
+#'
+#' This both interpolates between quantile levels already defined in `x` and
+#' extrapolates quantiles outside their bounds. The interpolation method is
+#' determined by the `quantile` argument `middle`, which can be either `"cubic"`
+#' for a (Hyman) cubic spline interpolation, or `"linear"` for simple linear
+#' interpolation.
+#'
+#' There is only one extrapolation method for values greater than the largest
+#' known quantile level or smaller than the smallest known quantile level. It
+#' assumes a roughly exponential tail, whose decay rate and offset are derived
+#' from the slope of the two most extreme quantile levels on a logistic scale.
+#' See the internal function `tail_extrapolate()` for the exact implementation.
 #'
 #' @param x a `distribution` vector
 #' @param probs a vector of probabilities at which to calculate quantiles
@@ -26,7 +38,7 @@
 #'   dist_normal(c(10, 2), c(5, 10)),
 #'   dist_quantiles(list(1:4, 8:11), list(c(.2, .4, .6, .8)))
 #' )
-#' extrapolate_quantiles(dstn, probs = c(.25, 0.5, .75))
+#' extrapolate_quantiles(dstn, probs = c(0.0001, 0.25, 0.5, 0.75, 0.99999))
 extrapolate_quantiles <- function(x, probs, replace_na = TRUE, ...) {
   UseMethod("extrapolate_quantiles")
 }
 
@@ -1,18 +1,41 @@
 #' Predict the future with today's value
 #'
-#' This is a simple forecasting model for
-#' [epiprocess::epi_df][epiprocess::as_epi_df] data. It uses the most recent
-#' observation as the
-#' forecast for any future date, and produces intervals based on the quantiles
-#' of the residuals of such a "flatline" forecast over all available training
-#' data.
+#' @description This is a simple forecasting model for
+#'   [epiprocess::epi_df][epiprocess::as_epi_df] data. It uses the most recent
+#'   observation as the forecast for any future date, and produces intervals
+#'   based on the quantiles of the residuals of such a "flatline" forecast over
+#'   all available training data.
 #'
 #' By default, the predictive intervals are computed separately for each
-#' combination of key values (`geo_value` + any additional keys) in the
-#' `epi_data` argument.
+#'   combination of key values (`geo_value` + any additional keys) in the
+#'   `epi_data` argument.
 #'
 #' This forecaster is very similar to that used by the
-#' [COVID19ForecastHub](https://covid19forecasthub.org)
+#'   [COVID19ForecastHub](https://covid19forecasthub.org)
+#'
+#' @details
+#'  Here is (roughly) the code for the `flatline_forecaster()` applied to the
+#'   `case_rate` for `epidatasets::covid_case_death_rates`.
+#'
+#' ```{r}
+#' jhu <- covid_case_death_rates %>%
+#'   filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny"))
+#' r <- epi_recipe(covid_case_death_rates) %>%
+#'   step_epi_ahead(case_rate, ahead = 7, skip = TRUE) %>%
+#'   recipes::update_role(case_rate, new_role = "predictor") %>%
+#'   recipes::add_role(all_of(key_colnames(jhu)), new_role = "predictor")
+#'
+#' f <- frosting() %>%
+#'   layer_predict() %>%
+#'   layer_residual_quantiles() %>%
+#'   layer_add_forecast_date() %>%
+#'   layer_add_target_date() %>%
+#'   layer_threshold(starts_with(".pred"))
+#'
+#' eng <- linear_reg() %>% set_engine("flatline")
+#' wf <- epi_workflow(r, eng, f) %>% fit(jhu)
+#' preds <- forecast(wf)
+#' ```
 #'
 #' @param epi_data An [epiprocess::epi_df][epiprocess::as_epi_df]
 #' @param outcome A scalar character for the column name we wish to predict.
 
@@ -1,4 +1,5 @@
-#' Add frosting to a workflow
+#' Given a `frosting()`, add it to, remove it from, or update it in an
+#' `epi_workflow`
 #'
 #' @param x A workflow
 #' @param frosting A frosting object created using `frosting()`.
 
@@ -1,10 +1,9 @@
 #' Get test data for prediction based on longest lag period
 #'
-#' Based on the longest lag period in the recipe,
-#' `get_test_data()` creates an [epi_df][epiprocess::as_epi_df]
-#' with columns `geo_value`, `time_value`
-#' and other variables in the original dataset,
-#' which will be used to create features necessary to produce forecasts.
+#' If `predict()` is given the full training dataset, it will produce a forecast
+#' for every day which has enough data. For most cases, this is far more
+#' forecasts than is necessary. `get_test_data()` is designed to restrict the given dataset to the minimum amount needed to produce a forecast on the `forecast_date`.
+#' Primarily this is based on the longest lag period in the recipe.
 #'
 #' The minimum required (recent) data to produce a forecast is equal to
 #' the maximum lag requested (on any predictor) plus the longest horizon