@@ -622,67 +622,83 @@ epix_detailed_restricted_mutate <- function(.data, ...) {
622
622
}
623
623
624
624
625
- # ' Slide a function over variables in an `epi_archive` or `grouped_epi_archive`
625
+ # ' Take each requested (group and) version in an archive, run a computation (e.g., forecast)
626
626
# '
627
- # ' Slides a given function over variables in an `epi_archive` object. This
628
- # ' behaves similarly to `epi_slide()`, with the key exception that it is
629
- # ' version-aware: the sliding computation at any given reference time t is
630
- # ' performed on **data that would have been available as of t**. This function
631
- # ' is intended for use in accurate backtesting of models; see
627
+ # ' ... and collect the results. This is useful for more accurately simulating
628
+ # ' how a forecaster, nowcaster, or other algorithm would have behaved in real
629
+ # ' time, factoring in reporting latency and data revisions; see
632
630
# ' \href{https://cmu-delphi.github.io/epipredict/articles/backtesting.html}{`vignette("backtesting",
633
631
# ' package="epipredict")`} for a walkthrough.
634
632
# '
633
+ # ' This is similar to looping over versions and calling [`epix_as_of`], but has
634
+ # ' some conveniences such as working naturally with [`grouped_epi_archive`]s,
635
+ # ' optional time windowing, and syntactic sugar to make things shorter to write.
636
+ # '
635
637
# ' @param .x An [`epi_archive`] or [`grouped_epi_archive`] object. If ungrouped,
636
638
# ' all data in `.x` will be treated as part of a single data group.
637
639
# ' @param .f Function, formula, or missing; together with `...` specifies the
638
- # ' computation to slide. To "slide" means to apply a computation over a
639
- # ' sliding (a.k.a. "rolling") time window for each data group. The window is
640
- # ' determined by the `.before` parameter (see details for more). If a
641
- # ' function, `.f` must have the form `function(x, g, t, ...)`, where
642
- # '
643
- # ' - "x" is an epi_df with the same column names as the archive's `DT`, minus
644
- # ' the `version` column
645
- # ' - "g" is a one-row tibble containing the values of the grouping variables
646
- # ' for the associated group
647
- # ' - "t" is the ref_time_value for the current window
648
- # ' - "..." are additional arguments
640
+ # ' computation. The computation will be run on each requested group-version
641
+ # ' combination, with a time window filter applied if `.before` is supplied.
642
+ # '
643
+ # ' - If `.f` is a function, it must have the form `function(x, g, v)` or
644
+ # ' `function(x, g, v, <additional configuration args>)`, where
645
+ # '
646
+ # ' - `x` is an `epi_df` with the same column names as the archive's `DT`,
647
+ # ' minus the `version` column. (Or, if `.all_versions = TRUE`, an
648
+ # ' `epi_archive` with the requested partial version history.)
649
+ # '
650
+ # ' - `g` is a one-row tibble containing the values of the grouping variables
651
+ # ' for the associated group.
652
+ # '
653
+ # ' - `v` (length-1) is the associated `version` (one of the requested
654
+ # ' `.versions`).
655
+ # '
656
+ # ' - `<additional configuration args>` are optional; you can add such
657
+ # ' arguments to your function and set them by passing them through the
658
+ # ' `...` argument to `epix_slide()`.
649
659
# '
650
660
# ' If a formula, `.f` can operate directly on columns accessed via `.x$var` or
651
661
# ' `.$var`, as in `~ mean (.x$var)` to compute a mean of a column `var` for
652
662
# ' each group-`ref_time_value` combination. The group key can be accessed via
653
- # ' `.y` or `.group_key`, and the reference time value can be accessed via `.z`
654
- # ' or `.ref_time_value`. If `.f` is missing, then `...` will specify the
655
- # ' computation.
663
+ # ' `.y` or `.group_key`, and the reference time value can be accessed via
664
+ # ' `.z`, `.version`, or `.ref_time_value`. If `.f` is missing, then `...` will
665
+ # ' specify the computation.
656
666
# ' @param ... Additional arguments to pass to the function or formula specified
657
667
# ' via `.f`. Alternatively, if `.f` is missing, then the `...` is interpreted
658
668
# ' as a ["data-masking"][rlang::args_data_masking] expression or expressions
659
669
# ' for tidy evaluation; in addition to referring to columns directly by name, the
660
670
# ' expressions have access to `.data` and `.env` pronouns as in `dplyr` verbs,
661
671
# ' and can also refer to `.x` (not the same as the input epi_archive),
662
- # ' `.group_key`, and `.ref_time_value`. See details for more.
663
- # ' @param .before How many time values before the `.ref_time_value`
664
- # ' should each snapshot handed to the function `.f` contain? If provided, it
665
- # ' should be a single value that is compatible with the time_type of the
666
- # ' time_value column (more below), but most commonly an integer. This window
667
- # ' endpoint is inclusive. For example, if `.before = 7`, `time_type`
668
- # ' in the archive is "day", and the `.ref_time_value` is January 8, then the
669
- # ' smallest time_value in the snapshot will be January 1. If missing, then the
670
- # ' default is no limit on the time values, so the full snapshot is given.
671
- # ' @param .versions Reference time values / versions for sliding
672
- # ' computations; each element of this vector serves both as the anchor point
673
- # ' for the `time_value` window for the computation and the `max_version`
674
- # ' `epix_as_of` which we fetch data in this window. If missing, then this will
675
- # ' set to a regularly-spaced sequence of values set to cover the range of
676
- # ' `version`s in the `DT` plus the `versions_end`; the spacing of values will
677
- # ' be guessed (using the GCD of the skips between values).
672
+ # ' `.group_key` and `.version`/`.ref_time_value`. See details for more.
673
+ # ' @param .before Optional; applies a `time_value` filter before running each
674
+ # ' computation. The default is not to apply a `time_value` filter. If
675
+ # ' provided, it should be a single integer or difftime that is compatible with
676
+ # ' the time_type of the time_value column. If an integer, then the minimum
677
+ # ' possible `time_value` included will be that many time steps (according to
678
+ # ' the `time_type`) before each requested `.version`. This window endpoint is
679
+ # ' inclusive. For example, if `.before = 14`, the `time_type` in the archive
680
+ # ' is "day", and the requested `.version` is January 15, then the smallest
681
+ # ' possible `time_value` in the snapshot will be January 1. Note that
682
+ # ' this does not mean that there will be 14 or 15 distinct `time_value`s
683
+ # ' actually appearing in the data; for most reporting streams, reporting as of
684
+ # ' January 15 won't include `time_value`s all the way through January 14, due
685
+ # ' to reporting latency. Unlike `epi_slide()`, `epix_slide()` won't fill in
686
+ # ' any missing `time_value`s in this window.
687
+ # ' @param .versions Requested versions on which to run the computation. Each
688
+ # ' requested `.version` also serves as the anchor point around which
689
+ # ' the `time_value` window specified by `.before` is drawn. If `.versions` is
690
+ # ' missing, it will be set to a regularly-spaced sequence of values set to
691
+ # ' cover the range of `version`s in the `DT` plus the `versions_end`; the
692
+ # ' spacing of values will be guessed (using the GCD of the skips between
693
+ # ' values).
678
694
# ' @param .new_col_name Either `NULL` or a string indicating the name of the new
679
695
# ' column that will contain the derived values. The default, `NULL`, will use
680
696
# ' the name "slide_value" unless your slide computations output data frames,
681
- # ' in which case they will be unpacked into the constituent columns and those
682
- # ' names used. If the resulting column name(s) overlap with the column names
683
- # ' used for labeling the computations, which are `group_vars(x)` and
684
- # ' ` "version"`, then the values for these columns must be identical to the
685
- # ' labels we assign.
697
+ # ' in which case they will be unpacked into the constituent columns and the
698
+ # ' data frame's column names will be used instead. If the resulting column
699
+ # ' name(s) overlap with the column names used for labeling the computations,
700
+ # ' which are `group_vars(x)` and ` "version"`, then the values for these
701
+ # ' columns must be identical to the labels we assign.
686
702
# ' @param .all_versions (Not the same as `.all_rows` parameter of `epi_slide`.)
687
703
# ' If `.all_versions = TRUE`, then the slide computation will be passed the
688
704
# ' version history (all `version <= .version` where `.version` is one of the
@@ -697,16 +713,17 @@ epix_detailed_restricted_mutate <- function(.data, ...) {
697
713
# ' @details A few key distinctions between the current function and `epi_slide()`:
698
714
# ' 1. In `.f` functions for `epix_slide`, one should not assume that the input
699
715
# ' data contains any rows with `time_value` matching the computation's
700
- # ' `.ref_time_value` (accessible via `attributes(<data>)$metadata$as_of`); for
701
- # ' typical epidemiological surveillance data, observations pertaining to a
702
- # ' particular time period (`time_value`) are first reported `as_of` some
703
- # ' instant after that time period has ended.
716
+ # ' `.version`, due to reporting latency; for typical epidemiological
717
+ # ' surveillance data, observations pertaining to a particular time period
718
+ # ' (`time_value`) are first reported `as_of` some instant after that time
719
+ # ' period has ended. No time window completion is performed as in
720
+ # ' `epi_slide()`.
704
721
# ' 2. The input class and columns are similar but different: `epix_slide`
705
722
# ' (with the default `.all_versions=FALSE`) keeps all columns and the
706
723
# ' `epi_df`-ness of the first argument to each computation; `epi_slide` only
707
724
# ' provides the grouping variables in the second input, and will convert the
708
725
# ' first input into a regular tibble if the grouping variables include the
709
- # ' essential `geo_value` column. (With .all_versions=TRUE`, `epix_slide` will
726
+ # ' essential `geo_value` column. (With ` .all_versions=TRUE`, `epix_slide` will
710
727
# ' provide an `epi_archive` rather than an `epi_df` to each
711
728
# ' computation.)
712
729
# ' 3. The output class and columns are similar but different: `epix_slide()`
@@ -726,75 +743,55 @@ epix_detailed_restricted_mutate <- function(.data, ...) {
726
743
# ' computations are allowed more flexibility in their outputs than in
727
744
# ' `epi_slide`, we can't guess a good representation for missing computations
728
745
# ' for excluded group-`.ref_time_value` pairs.
729
- # ' 76 . The `.versions` default for `epix_slide` is based on making an
746
+ # ' 6 . The `.versions` default for `epix_slide` is based on making an
730
747
# ' evenly-spaced sequence out of the `version`s in the `DT` plus the
731
748
# ' `versions_end`, rather than the `time_value`s.
749
+ # ' 7. `epix_slide()` computations can refer to the current element of
750
+ # ' `.versions` as either `.version` or `.ref_time_value`, while `epi_slide()`
751
+ # ' computations refer to the current element of `.ref_time_values` with
752
+ # ' `.ref_time_value`.
732
753
# '
733
754
# ' Apart from the above distinctions, the interfaces between `epix_slide()` and
734
755
# ' `epi_slide()` are the same.
735
756
# '
736
- # ' Furthermore, the current function can be considerably slower than
737
- # ' `epi_slide()`, for two reasons: (1) it must repeatedly fetch
738
- # ' properly-versioned snapshots from the data archive (via `epix_as_of()`),
739
- # ' and (2) it performs a "manual" sliding of sorts, and does not benefit from
740
- # ' the highly efficient `slider` package. For this reason, it should never be
741
- # ' used in place of `epi_slide()`, and only used when version-aware sliding is
742
- # ' necessary (as it its purpose).
743
- # '
744
757
# ' @examples
745
758
# ' library(dplyr)
746
759
# '
747
- # ' # Reference time points for which we want to compute slide values:
748
- # ' versions <- seq(as.Date("2020-06-02"),
749
- # ' as.Date("2020-06-15"),
750
- # ' by = "1 day"
751
- # ' )
760
+ # ' # Request only a small set of versions, for example's sake:
761
+ # ' requested_versions <-
762
+ # ' seq(as.Date("2020-09-02"), as.Date("2020-09-15"), by = "1 day")
752
763
# '
753
- # ' # A simple (but not very useful) example (see the archive vignette for a more
754
- # ' # realistic one ):
764
+ # ' # Investigate reporting lag of `percent_cli` signal (though normally we'd
765
+ # ' # probably work off of the dedicated `revision_summary()` function instead ):
755
766
# ' archive_cases_dv_subset %>%
756
- # ' group_by(geo_value) %>%
757
767
# ' epix_slide(
758
- # ' .f = ~ mean(.x$case_rate_7d_av),
759
- # ' .before = 2,
760
- # ' .versions = versions,
761
- # ' .new_col_name = "case_rate_7d_av_recent_av"
762
- # ' ) %>%
763
- # ' ungroup()
764
- # ' # We requested time windows that started 2 days before the corresponding time
765
- # ' # values. The actual number of `time_value`s in each computation depends on
766
- # ' # the reporting latency of the signal and `time_value` range covered by the
767
- # ' # archive (2020-06-01 -- 2021-11-30 in this example). In this case, we have
768
- # ' # * 0 `time_value`s, for ref time 2020-06-01 --> the result is automatically
769
- # ' # discarded
770
- # ' # * 1 `time_value`, for ref time 2020-06-02
771
- # ' # * 2 `time_value`s, for the rest of the results
772
- # ' # * never the 3 `time_value`s we would get from `epi_slide`, since, because
773
- # ' # of data latency, we'll never have an observation
774
- # ' # `time_value == .ref_time_value` as of `.ref_time_value`.
775
- # ' # The example below shows this type of behavior in more detail.
776
- # '
777
- # ' # Examining characteristics of the data passed to each computation with
778
- # ' # `all_versions=FALSE`.
768
+ # ' geowide_percent_cli_max_time = max(time_value[!is.na(percent_cli)]),
769
+ # ' geowide_percent_cli_rpt_lag = .version - geowide_percent_cli_max_time,
770
+ # ' .versions = requested_versions
771
+ # ' )
779
772
# ' archive_cases_dv_subset %>%
780
773
# ' group_by(geo_value) %>%
781
774
# ' epix_slide(
782
- # ' function(x, gk, rtv) {
783
- # ' tibble(
784
- # ' time_range = if (nrow(x) == 0L) {
785
- # ' "0 `time_value`s"
786
- # ' } else {
787
- # ' sprintf("%s -- %s", min(x$time_value), max(x$time_value))
788
- # ' },
789
- # ' n = nrow(x),
790
- # ' class1 = class(x)[[1L]]
791
- # ' )
792
- # ' },
793
- # ' .before = 5, .all_versions = FALSE,
794
- # ' .versions = versions
795
- # ' ) %>%
796
- # ' ungroup() %>%
797
- # ' arrange(geo_value, version)
775
+ # ' percent_cli_max_time = max(time_value[!is.na(percent_cli)]),
776
+ # ' percent_cli_rpt_lag = .version - percent_cli_max_time,
777
+ # ' .versions = requested_versions
778
+ # ' )
779
+ # '
780
+ # ' # Backtest a forecaster "pseudoprospectively" (i.e., faithfully with respect
781
+ # ' # to the data version history):
782
+ # ' case_death_rate_archive %>%
783
+ # ' epix_slide(
784
+ # ' .versions = as.Date(c("2021-10-01", "2021-10-08")),
785
+ # ' function(x, g, v) {
786
+ # ' epipredict::arx_forecaster(
787
+ # ' x,
788
+ # ' outcome = "death_rate",
789
+ # ' predictors = c("death_rate_7d_av", "case_rate_7d_av")
790
+ # ' )$predictions
791
+ # ' }
792
+ # ' )
793
+ # ' # See `vignette("backtesting", package="epipredict")` for a full walkthrough
794
+ # ' # on backtesting forecasters, including plots, etc.
798
795
# '
799
796
# ' # --- Advanced: ---
800
797
# '
0 commit comments