cmu-delphi
diff --git a/‎R/methods-epi_archive.R
+97-100 b/‎R/methods-epi_archive.R
+97-100
@@ -622,67 +622,83 @@ epix_detailed_restricted_mutate <- function(.data, ...) {
 }
 
 
-#' Slide a function over variables in an `epi_archive` or `grouped_epi_archive`
+#' Take each requested (group and) version in an archive, run a computation (e.g., forecast)
 #'
-#' Slides a given function over variables in an `epi_archive` object. This
-#' behaves similarly to `epi_slide()`, with the key exception that it is
-#' version-aware: the sliding computation at any given reference time t is
-#' performed on **data that would have been available as of t**. This function
-#' is intended for use in accurate backtesting of models; see
+#' ... and collect the results. This is useful for more accurately simulating
+#' how a forecaster, nowcaster, or other algorithm would have behaved in real
+#' time, factoring in reporting latency and data revisions; see
 #' \href{https://cmu-delphi.github.io/epipredict/articles/backtesting.html}{`vignette("backtesting",
 #' package="epipredict")`} for a walkthrough.
 #'
+#' This is similar to looping over versions and calling [`epix_as_of`], but has
+#' some conveniences such as working naturally with [`grouped_epi_archive`]s,
+#' optional time windowing, and syntactic sugar to make things shorter to write.
+#'
 #' @param .x An [`epi_archive`] or [`grouped_epi_archive`] object. If ungrouped,
 #'   all data in `x` will be treated as part of a single data group.
 #' @param .f Function, formula, or missing; together with `...` specifies the
-#'   computation to slide. To "slide" means to apply a computation over a
-#'   sliding (a.k.a. "rolling") time window for each data group. The window is
-#'   determined by the `.before` parameter (see details for more). If a
-#'   function, `.f` must have the form `function(x, g, t, ...)`, where
-#'
-#'   - "x" is an epi_df with the same column names as the archive's `DT`, minus
-#'     the `version` column
-#'   - "g" is a one-row tibble containing the values of the grouping variables
-#'   for the associated group
-#'   - "t" is the ref_time_value for the current window
-#'   - "..." are additional arguments
+#'   computation. The computation will be run on each requested group-version
+#'   combination, with a time window filter applied if `.before` is supplied.
+#'
+#'   - If `.f` is a function must have the form `function(x, g, v)` or
+#'     `function(x, g, v, <additional configuration args>)`, where
+#'
+#'     - `x` is an `epi_df` with the same column names as the archive's `DT`,
+#'       minus the `version` column. (Or, if `.all_versions = TRUE`, an
+#'       `epi_archive` with the requested partial version history.)
+#'
+#'     - `g` is a one-row tibble containing the values of the grouping variables
+#'       for the associated group.
+#'
+#'     - `v` (length-1) is the associated `version` (one of the requested
+#'       `.versions`)
+#'
+#'     - `<additional configuration args>` are optional; you can add such
+#'       arguments to your function and set them by passing them through the
+#'       `...` argument to `epix_slide()`.
 #'
 #'   If a formula, `.f` can operate directly on columns accessed via `.x$var` or
 #'   `.$var`, as in `~ mean (.x$var)` to compute a mean of a column `var` for
 #'   each group-`ref_time_value` combination. The group key can be accessed via
-#'   `.y` or `.group_key`, and the reference time value can be accessed via `.z`
-#'   or `.ref_time_value`. If `.f` is missing, then `...` will specify the
-#'   computation.
+#'   `.y` or `.group_key`, and the reference time value can be accessed via
+#'   `.z`, `.version`, or `.ref_time_value`. If `.f` is missing, then `...` will
+#'   specify the computation.
 #' @param ... Additional arguments to pass to the function or formula specified
 #'   via `f`. Alternatively, if `.f` is missing, then the `...` is interpreted
 #'   as a ["data-masking"][rlang::args_data_masking] expression or expressions
 #'   for tidy evaluation; in addition to referring columns directly by name, the
 #'   expressions have access to `.data` and `.env` pronouns as in `dplyr` verbs,
 #'   and can also refer to `.x` (not the same as the input epi_archive),
-#'   `.group_key`, and `.ref_time_value`. See details for more.
-#' @param .before How many time values before the `.ref_time_value`
-#'   should each snapshot handed to the function `.f` contain? If provided, it
-#'   should be a single value that is compatible with the time_type of the
-#'   time_value column (more below), but most commonly an integer. This window
-#'   endpoint is inclusive. For example, if `.before = 7`, `time_type`
-#'   in the archive is "day", and the `.ref_time_value` is January 8, then the
-#'   smallest time_value in the snapshot will be January 1. If missing, then the
-#'   default is no limit on the time values, so the full snapshot is given.
-#' @param .versions Reference time values / versions for sliding
-#'   computations; each element of this vector serves both as the anchor point
-#'   for the `time_value` window for the computation and the `max_version`
-#'   `epix_as_of` which we fetch data in this window. If missing, then this will
-#'   set to a regularly-spaced sequence of values set to cover the range of
-#'   `version`s in the `DT` plus the `versions_end`; the spacing of values will
-#'   be guessed (using the GCD of the skips between values).
+#'   `.group_key` and `.version`/`.ref_time_value`. See details for more.
+#' @param .before Optional; applies a `time_value` filter before running each
+#'   computation. The default is not to apply a `time_value` filter. If
+#'   provided, it should be a single integer or difftime that is compatible with
+#'   the time_type of the time_value column. If an integer, then the minimum
+#'   possible `time_value` included will be that many time steps (according to
+#'   the `time_type`) before each requested `.version`. This window endpoint is
+#'   inclusive. For example, if `.before = 14`, the `time_type` in the archive
+#'   is "day", and the requested `.version` is January 15, then the smallest
+#'   possible `time_value` possible in the snapshot will be January 1. Note that
+#'   this does not mean that there will be 14 or 15 distinct `time_value`s
+#'   actually appearing in the data; for most reporting streams, reporting as of
+#'   January 15 won't include `time_value`s all the way through January 14, due
+#'   to reporting latency. Unlike `epi_slide()`, `epix_slide()` won't fill in
+#'   any missing `time_values` in this window.
+#' @param .versions Requested versions on which to run the computation. Each
+#'   requested `.version` also serves as the anchor point around which for which
+#'   the `time_value` window specified by `.before` is drawn. If `.versions` is
+#'   missing, it will be set to a regularly-spaced sequence of values set to
+#'   cover the range of `version`s in the `DT` plus the `versions_end`; the
+#'   spacing of values will be guessed (using the GCD of the skips between
+#'   values).
 #' @param .new_col_name Either `NULL` or a string indicating the name of the new
 #'   column that will contain the derived values. The default, `NULL`, will use
 #'   the name "slide_value" unless your slide computations output data frames,
-#'   in which case they will be unpacked into the constituent columns and those
-#'   names used. If the resulting column name(s) overlap with the column names
-#'   used for labeling the computations, which are `group_vars(x)` and
-#'   `"version"`, then the values for these columns must be identical to the
-#'   labels we assign.
+#'   in which case they will be unpacked into the constituent columns and the
+#'   data frame's column names will be used instead. If the resulting column
+#'   name(s) overlap with the column names used for labeling the computations,
+#'   which are `group_vars(x)` and `"version"`, then the values for these
+#'   columns must be identical to the labels we assign.
 #' @param .all_versions (Not the same as `.all_rows` parameter of `epi_slide`.)
 #'   If `.all_versions = TRUE`, then the slide computation will be passed the
 #'   version history (all `version <= .version` where `.version` is one of the
@@ -697,16 +713,17 @@ epix_detailed_restricted_mutate <- function(.data, ...) {
 #' @details A few key distinctions between the current function and `epi_slide()`:
 #'   1. In `.f` functions for `epix_slide`, one should not assume that the input
 #'   data to contain any rows with `time_value` matching the computation's
-#'   `.ref_time_value` (accessible via `attributes(<data>)$metadata$as_of`); for
-#'   typical epidemiological surveillance data, observations pertaining to a
-#'   particular time period (`time_value`) are first reported `as_of` some
-#'   instant after that time period has ended.
+#'   `.version`, due to reporting latency; for typical epidemiological
+#'   surveillance data, observations pertaining to a particular time period
+#'   (`time_value`) are first reported `as_of` some instant after that time
+#'   period has ended. No time window completion is performed as in
+#'   `epi_slide()`.
 #'   2. The input class and columns are similar but different: `epix_slide`
 #'   (with the default `.all_versions=FALSE`) keeps all columns and the
 #'   `epi_df`-ness of the first argument to each computation; `epi_slide` only
 #'   provides the grouping variables in the second input, and will convert the
 #'   first input into a regular tibble if the grouping variables include the
-#'   essential `geo_value` column. (With .all_versions=TRUE`, `epix_slide` will
+#'   essential `geo_value` column. (With `.all_versions=TRUE`, `epix_slide` will
 #'   will provide an `epi_archive` rather than an `epi-df` to each
 #'   computation.)
 #'   3. The output class and columns are similar but different: `epix_slide()`
@@ -726,75 +743,55 @@ epix_detailed_restricted_mutate <- function(.data, ...) {
 #'   computations are allowed more flexibility in their outputs than in
 #'   `epi_slide`, we can't guess a good representation for missing computations
 #'   for excluded group-`.ref_time_value` pairs.
-#'   76. The `.versions` default for `epix_slide` is based on making an
+#'   6. The `.versions` default for `epix_slide` is based on making an
 #'   evenly-spaced sequence out of the `version`s in the `DT` plus the
 #'   `versions_end`, rather than the `time_value`s.
+#'   7. `epix_slide()` computations can refer to the current element of
+#'   `.versions` as either `.version` or `.ref_time_value`, while `epi_slide()`
+#'   computations refer to the current element of `.ref_time_values` with
+#'   `.ref_time_value`.
 #'
 #' Apart from the above distinctions, the interfaces between `epix_slide()` and
 #' `epi_slide()` are the same.
 #'
-#' Furthermore, the current function can be considerably slower than
-#'   `epi_slide()`, for two reasons: (1) it must repeatedly fetch
-#'   properly-versioned snapshots from the data archive (via `epix_as_of()`),
-#'   and (2) it performs a "manual" sliding of sorts, and does not benefit from
-#'   the highly efficient `slider` package. For this reason, it should never be
-#'   used in place of `epi_slide()`, and only used when version-aware sliding is
-#'   necessary (as it its purpose).
-#'
 #' @examples
 #' library(dplyr)
 #'
-#' # Reference time points for which we want to compute slide values:
-#' versions <- seq(as.Date("2020-06-02"),
-#'   as.Date("2020-06-15"),
-#'   by = "1 day"
-#' )
+#' # Request only a small set of versions, for example's sake:
+#' requested_versions <-
+#'   seq(as.Date("2020-09-02"), as.Date("2020-09-15"), by = "1 day")
 #'
-#' # A simple (but not very useful) example (see the archive vignette for a more
-#' # realistic one):
+#' # Investigate reporting lag of `percent_cli` signal (though normally we'd
+#' # probably work off of the dedicated `revision_summary()` function instead):
 #' archive_cases_dv_subset %>%
-#'   group_by(geo_value) %>%
 #'   epix_slide(
-#'     .f = ~ mean(.x$case_rate_7d_av),
-#'     .before = 2,
-#'     .versions = versions,
-#'     .new_col_name = "case_rate_7d_av_recent_av"
-#'   ) %>%
-#'   ungroup()
-#' # We requested time windows that started 2 days before the corresponding time
-#' # values. The actual number of `time_value`s in each computation depends on
-#' # the reporting latency of the signal and `time_value` range covered by the
-#' # archive (2020-06-01 -- 2021-11-30 in this example).  In this case, we have
-#' # * 0 `time_value`s, for ref time 2020-06-01 --> the result is automatically
-#' #                                                discarded
-#' # * 1 `time_value`, for ref time 2020-06-02
-#' # * 2 `time_value`s, for the rest of the results
-#' # * never the 3 `time_value`s we would get from `epi_slide`, since, because
-#' #   of data latency, we'll never have an observation
-#' #   `time_value == .ref_time_value` as of `.ref_time_value`.
-#' # The example below shows this type of behavior in more detail.
-#'
-#' # Examining characteristics of the data passed to each computation with
-#' # `all_versions=FALSE`.
+#'     geowide_percent_cli_max_time = max(time_value[!is.na(percent_cli)]),
+#'     geowide_percent_cli_rpt_lag = .version - geowide_percent_cli_max_time,
+#'     .versions = requested_versions
+#'   )
 #' archive_cases_dv_subset %>%
 #'   group_by(geo_value) %>%
 #'   epix_slide(
-#'     function(x, gk, rtv) {
-#'       tibble(
-#'         time_range = if (nrow(x) == 0L) {
-#'           "0 `time_value`s"
-#'         } else {
-#'           sprintf("%s -- %s", min(x$time_value), max(x$time_value))
-#'         },
-#'         n = nrow(x),
-#'         class1 = class(x)[[1L]]
-#'       )
-#'     },
-#'     .before = 5, .all_versions = FALSE,
-#'     .versions = versions
-#'   ) %>%
-#'   ungroup() %>%
-#'   arrange(geo_value, version)
+#'     percent_cli_max_time = max(time_value[!is.na(percent_cli)]),
+#'     percent_cli_rpt_lag = .version - percent_cli_max_time,
+#'     .versions = requested_versions
+#'   )
+#'
+#' # Backtest a forecaster "pseudoprospectively" (i.e., faithfully with respect
+#' # to the data version history):
+#' case_death_rate_archive %>%
+#'   epix_slide(
+#'     .versions = as.Date(c("2021-10-01", "2021-10-08")),
+#'     function(x, g, v) {
+#'       epipredict::arx_forecaster(
+#'         x,
+#'         outcome = "death_rate",
+#'         predictors = c("death_rate_7d_av", "case_rate_7d_av")
+#'       )$predictions
+#'     }
+#'   )
+#' # See `vignette("backtesting", package="epipredict")` for a full walkthrough
+#' # on backtesting forecasters, including plots, etc.
 #'
 #' # --- Advanced: ---
 #'