Add specialized epix_slide for epi_slide_opt #611

Open · wants to merge 52 commits into base: dev

Changes from 46 commits

Commits (52)
d8700fb
refactor: hoist some epi_slide_opt pre-processing to helpers
brookslogan Feb 26, 2025
9effa5c
feat: WIP epix_epi_slide_opt
brookslogan Feb 26, 2025
7f97236
epix_epi_slide_opt: don't re-order input columns in output
brookslogan Feb 28, 2025
e59e2e9
feat: support before = Inf in epix_epi_slide_opt
brookslogan Mar 3, 2025
486b488
feat(epi_slide_opt): improve feedback when .f is forgotten
brookslogan Mar 3, 2025
df36e2c
fix(epix_epi_slide_opt): support ... forwarding
brookslogan Mar 3, 2025
8b94f79
Clean up some comments&code + note bug
brookslogan Mar 3, 2025
0462a09
fix(epix_epi_slide_opt): on data.table `.f`s with `.align != "right"`
brookslogan Mar 3, 2025
29d2e29
Clear out some more comments
brookslogan Mar 3, 2025
d991590
Fix missing ukey_names arg
brookslogan Mar 4, 2025
0e4f3de
Check for missing & improper ukey_names args
brookslogan Mar 4, 2025
9054d1f
Remove some commented ideas that aren't quick wins
brookslogan Mar 4, 2025
0a69ce2
WIP cleaning up approx_equal
brookslogan Mar 5, 2025
5c5b098
fix(apply_compactify): avoid arrange on data.table, `i` parsing issues
brookslogan Mar 5, 2025
6d03a0f
perf: speed up compactification with `approx_equal`
brookslogan Mar 5, 2025
f74485a
fix(approx_equal): missing import
brookslogan Mar 5, 2025
2a42e19
docs(approx_equal): roxygen2 + comment on inconsistencies/bugs
brookslogan Mar 5, 2025
e84f3dc
fix(approx_equal): consistency with vec_slice(na_equal=FALSE)
brookslogan Mar 5, 2025
a3a52c0
fix(approx_equal): on bare numeric matrices
brookslogan Mar 5, 2025
32a1c79
feat: approx_equal on lists
brookslogan Mar 5, 2025
191fc7c
docs(approx_equal): iterate on @return + doc approx_equal0
brookslogan Mar 5, 2025
c02166a
WIP docs(epix_epi_slide_opt_one_epikey): initial
brookslogan Mar 5, 2025
7e36241
refactor: move epi_slide_opt & helpers to its own file
brookslogan Mar 7, 2025
6ffa6a2
Actually turn epi_slide_opt into S3 method
brookslogan Mar 7, 2025
ab82b09
Clean up unnecessary comments and unused helper functions, +@keywords…
brookslogan Mar 6, 2025
08b1783
approx_equal: make "abs_tol=" mandatory, +validation, +docs
brookslogan Mar 6, 2025
2593419
Expand epi_slide_opt_archive_one_epikey example
brookslogan Mar 6, 2025
5bd8d0e
WIP epi_slide_opt.epi_archive tests
brookslogan Mar 6, 2025
1323bf1
More WIP on tests
brookslogan Mar 7, 2025
e2fb79b
Mark renaming TODO on approx_equal
brookslogan Mar 7, 2025
540549d
fix!: as_epi_archive(tibble) key setting; + distrust key if data.table
brookslogan Mar 10, 2025
1be9df7
Make epi_archive key order geo !!!other time version
brookslogan Mar 10, 2025
56d7cb0
docs(new_epi_archive): roxygen2 for new param requirements
brookslogan Mar 10, 2025
0435460
fix(epi_slide_opt.epi_archive): as.data.table(tibble) key setting
brookslogan Mar 10, 2025
dd84924
tests(epi_slide_opt): on example data sets
brookslogan Mar 10, 2025
84c9db0
Fix & test epi_slide_opt.grouped_epi_archive behavior
brookslogan Mar 10, 2025
8149aa2
Rename approx_equal -> vec_approx_equal
brookslogan Mar 10, 2025
5df41cd
Fix missing n_groups import + epiprocess::: CHECK lint
brookslogan Mar 10, 2025
cbe1cb1
Address additional lints, CHECK issues
brookslogan Mar 10, 2025
3bb956c
Address missing ::: in tests
brookslogan Mar 10, 2025
27ff6dd
Fix another missing `:::`
brookslogan Mar 10, 2025
58ce1d0
Fix + add NEWS.md entries
brookslogan Mar 11, 2025
afc5d06
Fix CHECK doc line length lint
brookslogan Mar 11, 2025
df8ad0c
Fix missing library(dplyr) in example
brookslogan Mar 11, 2025
168af56
Fix {epiprocess} -> `{epiprocess}` in roxygen
brookslogan Mar 11, 2025
cc22517
docs: add vec_approx_equal to pkgdown reference index
brookslogan Mar 11, 2025
bd86054
docs(vec_approx_equal): mention vctrs::vec_proxy_equal
brookslogan Mar 18, 2025
97a2008
Fix & test some vec_approx_equal vs. vec_equal behaviors
brookslogan Mar 25, 2025
31d8c2c
fix(vec_approx_equal): don't assume inds1 and inds2 non-NULL together
brookslogan Mar 25, 2025
c8d246c
perf(vec_approx_equal): avoid vec_cast_common, ptype2 work
brookslogan Mar 26, 2025
e73b203
docs(vec_approx_equal): convert comment to issue
brookslogan Mar 26, 2025
7526dd0
docs(vec_approx_equal): clean up some outdated/misleading comments
brookslogan Mar 26, 2025
3 changes: 3 additions & 0 deletions DESCRIPTION
@@ -97,6 +97,8 @@ Collate:
'correlation.R'
'epi_df.R'
'epi_df_forbidden_methods.R'
'epi_slide_opt_archive.R'
'epi_slide_opt_edf.R'
'epiprocess-package.R'
'group_by_epi_df_methods.R'
'methods-epi_archive.R'
@@ -105,6 +107,7 @@ Collate:
'key_colnames.R'
'methods-epi_df.R'
'outliers.R'
'patch.R'
'reexports.R'
'revision_analysis.R'
'slide.R'
28 changes: 27 additions & 1 deletion NAMESPACE
@@ -27,6 +27,9 @@ S3method(dplyr_col_modify,col_modify_recorder_df)
S3method(dplyr_col_modify,epi_df)
S3method(dplyr_reconstruct,epi_df)
S3method(dplyr_row_slice,epi_df)
S3method(epi_slide_opt,epi_archive)
S3method(epi_slide_opt,epi_df)
S3method(epi_slide_opt,grouped_epi_archive)
S3method(epix_slide,epi_archive)
S3method(epix_slide,grouped_epi_archive)
S3method(epix_truncate_versions_after,epi_archive)
@@ -101,6 +104,7 @@ export(time_column_names)
export(ungroup)
export(unnest)
export(validate_epi_archive)
export(vec_approx_equal)
export(version_column_names)
import(epidatasets)
importFrom(checkmate,anyInfinite)
@@ -117,13 +121,19 @@ importFrom(checkmate,assert_logical)
importFrom(checkmate,assert_number)
importFrom(checkmate,assert_numeric)
importFrom(checkmate,assert_scalar)
importFrom(checkmate,assert_set_equal)
importFrom(checkmate,assert_string)
importFrom(checkmate,assert_subset)
importFrom(checkmate,assert_tibble)
importFrom(checkmate,assert_true)
importFrom(checkmate,checkInt)
importFrom(checkmate,check_atomic)
importFrom(checkmate,check_character)
importFrom(checkmate,check_data_frame)
importFrom(checkmate,check_logical)
importFrom(checkmate,check_names)
importFrom(checkmate,check_null)
importFrom(checkmate,check_numeric)
importFrom(checkmate,expect_class)
importFrom(checkmate,test_int)
importFrom(checkmate,test_set_equal)
@@ -143,6 +153,7 @@ importFrom(data.table,address)
importFrom(data.table,as.data.table)
importFrom(data.table,between)
importFrom(data.table,copy)
importFrom(data.table,fifelse)
importFrom(data.table,frollapply)
importFrom(data.table,frollmean)
importFrom(data.table,frollsum)
@@ -151,6 +162,8 @@ importFrom(data.table,key)
importFrom(data.table,rbindlist)
importFrom(data.table,set)
importFrom(data.table,setDF)
importFrom(data.table,setDT)
importFrom(data.table,setcolorder)
importFrom(data.table,setkeyv)
importFrom(dplyr,"%>%")
importFrom(dplyr,across)
@@ -173,8 +186,8 @@ importFrom(dplyr,if_all)
importFrom(dplyr,if_any)
importFrom(dplyr,if_else)
importFrom(dplyr,is_grouped_df)
importFrom(dplyr,lag)
importFrom(dplyr,mutate)
importFrom(dplyr,n_groups)
importFrom(dplyr,pick)
importFrom(dplyr,pull)
importFrom(dplyr,relocate)
@@ -200,6 +213,7 @@ importFrom(rlang,"%||%")
importFrom(rlang,.data)
importFrom(rlang,.env)
importFrom(rlang,arg_match)
importFrom(rlang,arg_match0)
importFrom(rlang,caller_arg)
importFrom(rlang,caller_env)
importFrom(rlang,check_dots_empty)
@@ -212,6 +226,7 @@ importFrom(rlang,expr_label)
importFrom(rlang,f_env)
importFrom(rlang,f_rhs)
importFrom(rlang,is_bare_integerish)
importFrom(rlang,is_bare_list)
importFrom(rlang,is_bare_numeric)
importFrom(rlang,is_environment)
importFrom(rlang,is_formula)
@@ -235,10 +250,12 @@ importFrom(slider,slide_sum)
importFrom(stats,cor)
importFrom(stats,median)
importFrom(tibble,as_tibble)
importFrom(tibble,is_tibble)
importFrom(tibble,new_tibble)
importFrom(tibble,validate_tibble)
importFrom(tidyr,complete)
importFrom(tidyr,full_seq)
importFrom(tidyr,nest)
importFrom(tidyr,unnest)
importFrom(tidyselect,any_of)
importFrom(tidyselect,eval_select)
@@ -248,15 +265,24 @@ importFrom(tsibble,as_tsibble)
importFrom(utils,capture.output)
importFrom(utils,tail)
importFrom(vctrs,"vec_slice<-")
importFrom(vctrs,obj_is_vector)
importFrom(vctrs,vec_cast)
importFrom(vctrs,vec_cast_common)
importFrom(vctrs,vec_data)
importFrom(vctrs,vec_duplicate_any)
importFrom(vctrs,vec_duplicate_id)
importFrom(vctrs,vec_equal)
importFrom(vctrs,vec_in)
importFrom(vctrs,vec_match)
importFrom(vctrs,vec_order)
importFrom(vctrs,vec_ptype)
importFrom(vctrs,vec_rbind)
importFrom(vctrs,vec_recycle)
importFrom(vctrs,vec_recycle_common)
importFrom(vctrs,vec_rep)
importFrom(vctrs,vec_rep_each)
importFrom(vctrs,vec_seq_along)
importFrom(vctrs,vec_size)
importFrom(vctrs,vec_size_common)
importFrom(vctrs,vec_slice)
importFrom(vctrs,vec_sort)
14 changes: 14 additions & 0 deletions NEWS.md
@@ -2,6 +2,20 @@

Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicate PR's.

# epiprocess 0.12

## Breaking changes

- The low-level `new_epi_archive()` function's `x` argument has been replaced
with a `data_table` argument, which now has extra requirements; see
`?new_epi_archive`. Users should still be using `as_epi_archive()` unless they
have a need for something lower-level.

## New features

- `epi_slide_{mean,sum,opt}` now work on `epi_archive`s, preparing version
histories for 7-day-averages of signals, etc.

# epiprocess 0.11

## Breaking changes
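
A hedged usage sketch of the new archive support described in the entry above (toy data; the interface is assumed to match the documented `epi_df` version of `epi_slide_mean`):

library(epiprocess)
library(dplyr)
archive <- tibble::tibble(
  geo_value = "ca",
  time_value = as.Date("2020-01-01") + 0:9,
  version = as.Date("2020-01-11"),
  cases = 1:10
) %>%
  as_epi_archive()
# 7-day trailing means, computed across the full version history:
archive %>% epi_slide_mean(cases, .window_size = 7)
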
166 changes: 83 additions & 83 deletions R/archive.R
@@ -186,17 +186,21 @@ next_after.Date <- function(x) x + 1L
#' archive. Unexpected behavior may result from modifying the metadata
#' directly.
#'
#' @param x A data.frame, data.table, or tibble, with columns `geo_value`,
#' `time_value`, `version`, and then any additional number of columns.
#' @param data_table a `data.table` with [`data.table::key()`] equal to
#' `c("geo_value", other_keys, "time_value", "version")`. For `data.table`
#' users: this sets up an alias of `data_table`; if you plan to keep on
#' working with `data_table` or working directly with the archive's `$DT`
#' using mutating operations, you should `copy()` if appropriate. We will not
#' mutate the `DT` with any exported `{epiprocess}` functions, though.
#' @param geo_type DEPRECATED Has no effect. Geo value type is inferred from the
#' location column and set to "custom" if not recognized.
#' @param time_type DEPRECATED Has no effect. Time value type inferred from the time
#' column and set to "custom" if not recognized. Unpredictable behavior may result
#' if the time type is not recognized.
#' location column and set to "custom" if not recognized.
#' @param time_type DEPRECATED Has no effect. Time value type inferred from the
#' time column and set to "custom" if not recognized. Unpredictable behavior
#' may result if the time type is not recognized.
#' @param other_keys Character vector specifying the names of variables in `x`
#' that should be considered key variables (in the language of `data.table`)
#' apart from "geo_value", "time_value", and "version". Typical examples
#' are "age" or more granular geographies.
#' apart from "geo_value", "time_value", and "version". Typical examples are
#' "age" or more granular geographies.
#' @param compactify Optional; `TRUE`, `FALSE`, or `"message"`. `TRUE` will
#' remove some redundant rows, `FALSE` will not. `"message"` is like `TRUE`
#' but will emit a message if anything was changed. Default is `TRUE`. See
@@ -278,41 +282,22 @@ next_after.Date <- function(x) x + 1L
#' x <- df %>% as_epi_archive(other_keys = "county")
#'
new_epi_archive <- function(
x,
data_table,
geo_type,
time_type,
other_keys,
clobberable_versions_start,
versions_end) {
assert_data_frame(x)
assert_class(data_table, "data.table")
assert_string(geo_type)
assert_string(time_type)
assert_character(other_keys, any.missing = FALSE)
if (any(c("geo_value", "time_value", "version") %in% other_keys)) {
cli_abort("`other_keys` cannot contain \"geo_value\", \"time_value\", or \"version\".")
}
validate_version_bound(clobberable_versions_start, x, na_ok = TRUE)
validate_version_bound(versions_end, x, na_ok = FALSE)

key_vars <- c("geo_value", "time_value", other_keys, "version")
if (!all(key_vars %in% names(x))) {
# Give a more tailored error message than as.data.table would:
cli_abort(c(
"`x` is missing the following expected columns:
{format_varnames(setdiff(key_vars, names(x)))}.",
">" = "You might need to `dplyr::rename()` beforehand
or use `as_epi_archive()`'s renaming feature.",
">" = if (!all(other_keys %in% names(x))) {
"Check also for typos in `other_keys`."
}
))
}

# Create the data table; if x was an un-keyed data.table itself,
# then the call to as.data.table() will fail to set keys, so we
# need to check this, then do it manually if needed
data_table <- as.data.table(x, key = key_vars)
if (!identical(key_vars, key(data_table))) setkeyv(data_table, cols = key_vars)
assert_true(identical(key(data_table), c("geo_value", other_keys, "time_value", "version")))
validate_version_bound(clobberable_versions_start, data_table, na_ok = TRUE)
validate_version_bound(versions_end, data_table, na_ok = FALSE)

structure(
list(
@@ -334,7 +319,7 @@ new_epi_archive <- function(
validate_epi_archive <- function(x) {
assert_class(x, "epi_archive")

ukey_vars1 <- c("geo_value", "time_value", x$other_keys, "version")
ukey_vars1 <- c("geo_value", x$other_keys, "time_value", "version")
ukey_vars2 <- key(x$DT)
if (!identical(ukey_vars1, ukey_vars2)) {
cli_abort(c("`data.table::key(x$DT)` not as expected",
@@ -401,7 +386,7 @@ validate_epi_archive <- function(x) {
#' would be `key(DT)`.
#' @param abs_tol numeric, >=0; absolute tolerance to use on numeric measurement
#' columns when determining whether something can be compactified away; see
#' [`is_locf`]
#' [`vec_approx_equal`]
#'
#' @importFrom data.table is.data.table key
#' @importFrom dplyr arrange filter
@@ -420,10 +405,23 @@ apply_compactify <- function(updates_df, ukey_names, abs_tol = 0) {
}
assert_numeric(abs_tol, len = 1, lower = 0)

if (!is.data.table(updates_df) || !identical(key(updates_df), ukey_names)) {
if (is.data.table(updates_df)) {
if (!identical(key(updates_df), ukey_names)) {
cli_abort(c("`ukey_names` should match `key(updates_df)`",
"i" = "`ukey_names` was {format_chr_deparse(ukey_names)}",
"i" = "`key(updates_df)` was {format_chr_deparse(key(updates_df))}"
))
}
} else {
updates_df <- updates_df %>% arrange(pick(all_of(ukey_names)))
}
updates_df[!update_is_locf(updates_df, ukey_names, abs_tol), ]

# In case updates_df is a data.table, store keep flags in a local: "When the
# first argument inside DT[...] is a single symbol (e.g. DT[var]), data.table
# looks for var in calling scope". In case it's not a data.table, make sure to
# use df[i,] not just df[i].
to_keep <- !update_is_locf(updates_df, ukey_names, abs_tol)
updates_df[to_keep, ]
}
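
A sketch of the data.table `i` scoping behavior the comment above works around (toy objects; behavior per data.table's documented handling of a single-symbol `i`):

library(data.table)
dt <- data.table(x = c(10, 20, 30))
df <- data.frame(x = c(10, 20, 30))
keep <- c(TRUE, FALSE, TRUE)
dt[keep] # data.table: single-symbol `i` is looked up in calling scope -> rows 1 and 3
df[keep, ] # data.frame: the comma is required; `df[keep]` would select columns instead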

#' get the entries that `compactify` would remove
@@ -460,56 +458,38 @@ update_is_locf <- function(arranged_updates_df, ukey_names, abs_tol) {
ekt_names <- ukey_names[ukey_names != "version"]
val_names <- all_names[!all_names %in% ukey_names]

Reduce(`&`, lapply(updates_col_refs[ekt_names], is_locf, abs_tol, TRUE)) &
Reduce(`&`, lapply(updates_col_refs[val_names], is_locf, abs_tol, FALSE))
}

#' Checks to see if a value in a vector is LOCF
#' @description LOCF meaning last observation carried forward (to later
#' versions). Lags the vector by 1, then compares with itself. If `is_key` is
#' `TRUE`, only values that are exactly the same between the lagged and
#' original are considered LOCF. If `is_key` is `FALSE` and `vec` is a vector
#' of numbers ([`base::is.numeric`]), then approximate equality will be used,
#' checking whether the absolute difference between each pair of entries is
#' `<= abs_tol`; if `vec` is something else, then exact equality is used
#' instead.
#'
#' @details
#'
#' We include epikey-time columns in LOCF comparisons as part of an optimization
#' to avoid slower grouped operations while still ensuring that the first
#' observation for each time series will not be marked as LOCF. We test these
#' key columns for exact equality to prevent chopping off consecutive
#' time_values during flat periods when `abs_tol` is high.
#'
#' We use exact equality for non-`is.numeric` double/integer columns such as
#' dates, datetimes, difftimes, `tsibble::yearmonth`s, etc., as these may be
#' used as part of re-indexing or grouping procedures, and we don't want to
#' change the number of groups for those operations when we remove LOCF data
#' during compactification.
#'
#' @importFrom dplyr lag if_else
#' @importFrom rlang is_bare_numeric
#' @importFrom vctrs vec_equal
#' @keywords internal
is_locf <- function(vec, abs_tol, is_key) { # nolint: object_usage_linter
lag_vec <- lag(vec)
if (is.vector(vec, mode = "numeric") && !is_key) {
# (integer or double vector, no class (& no dims); maybe names, which we'll
# ignore like `vec_equal`); not a key column
unname(if_else(
!is.na(vec) & !is.na(lag_vec),
abs(vec - lag_vec) <= abs_tol,
is.na(vec) & is.na(lag_vec)
))
n_updates <- nrow(arranged_updates_df)
if (n_updates == 0L) {
logical(0L)
} else if (n_updates == 1L) {
FALSE # sole observation is not LOCF
} else {
vec_equal(vec, lag_vec, na_equal = TRUE)
ekts_tbl <- new_tibble(updates_col_refs[ekt_names])
vals_tbl <- new_tibble(updates_col_refs[val_names])
# n_updates >= 2L so we can use `:` naturally (this is the reason for
# separating out n_updates == 1L from this case):
inds1 <- 2L:n_updates
inds2 <- 1L:(n_updates - 1L)
c(
FALSE, # first observation is not LOCF
vec_approx_equal0(ekts_tbl,
inds1 = inds1, ekts_tbl, inds2 = inds2,
# check ekt (key) cols with 0 tolerance:
na_equal = TRUE, abs_tol = 0
) &
vec_approx_equal0(vals_tbl,
inds1 = inds1, vals_tbl, inds2 = inds2,
na_equal = TRUE, abs_tol = abs_tol
)
)
}
}
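
A sketch of the shifted-index LOCF comparison above (internal helper, hence `epiprocess:::`; toy vector):

x <- c(1, 1, 1 + 1e-10, 2, NA, NA)
n <- length(x)
c(
  FALSE, # first observation is never LOCF
  epiprocess:::vec_approx_equal0(x, x,
    na_equal = TRUE, abs_tol = 1e-8,
    inds1 = 2:n, inds2 = 1:(n - 1)
  )
)
#> FALSE  TRUE  TRUE FALSE FALSE  TRUE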

#' `as_epi_archive` converts a data frame, data table, or tibble into an
#' `epi_archive` object.
#'
#' @param x A data.frame, data.table, or tibble, with columns `geo_value`,
#' `time_value`, `version`, and then any additional number of columns.
#' @param ... used for specifying column names, as in [`dplyr::rename`]. For
#' example `version = release_date`
#' @param .versions_end location based versions_end, used to avoid prefix
@@ -530,11 +510,32 @@ as_epi_archive <- function(
.versions_end = max_version_with_row_in(x), ...,
versions_end = .versions_end) {
assert_data_frame(x)
# Convert first to data.frame to guard against data.table#6859 and potentially
# other things epiprocess#618:
x_already_copied <- identical(class(x), c("data.table", "data.frame"))
x <- as.data.frame(x)
x <- rename(x, ...)
x <- guess_column_name(x, "time_value", time_column_names())
x <- guess_column_name(x, "geo_value", geo_column_names())
if (!all(other_keys %in% names(x))) {
# Give a more tailored error message than as.data.table would:
cli_abort(c(
"`x` is missing the following expected columns:
{format_varnames(setdiff(other_keys, names(x)))}.",
">" = "You might need to `dplyr::rename()` beforehand
or using `as_epi_archive()`'s renaming feature."
))
}
x <- guess_column_name(x, "time_value", time_column_names())
x <- guess_column_name(x, "version", version_column_names())

# Convert to data.table:
key_vars <- c("geo_value", other_keys, "time_value", "version")
if (x_already_copied) {
setDT(x, key = key_vars)
} else {
x <- as.data.table(x, key = key_vars)
}

if (lifecycle::is_present(geo_type)) {
cli_warn("epi_archive constructor argument `geo_type` is now ignored. Consider removing.")
}
@@ -555,11 +556,10 @@ as_epi_archive <- function(
cli_abort('`compactify` must be `TRUE`, `FALSE`, or `"message"`')
}

data_table <- result$DT
key_vars <- key(data_table)
data_table <- result$DT # probably just `x`, but take no chances

nrow_before_compactify <- nrow(data_table)
# Runs compactify on data frame
# Runs compactify on data_table
if (identical(compactify, TRUE) || identical(compactify, "message")) {
compactified <- apply_compactify(data_table, key_vars, compactify_abs_tol)
} else {
221 changes: 221 additions & 0 deletions R/epi_slide_opt_archive.R
@@ -0,0 +1,221 @@
#' Core operation of `epi_slide_opt.epi_archive` for a single epikey's history
#'
#' @param updates tibble with two columns: `version` and `subtbl`; `subtbl` is a
#' list of tibbles, each with a `time_value` column and measurement columns.
#' The epikey should not appear.
#' @param in_colnames chr; names of columns to which to apply `f_dots_baked`
#' @param f_dots_baked supported sliding function from `{data.table}` or
#' `{slider}`, potentially with some arguments baked in with
#' [`purrr::partial`]
#' @param f_from_package string; name of package from which `f_dots_baked`
#' (pre-`partial`) originates
#' @param before integerish >=0 or Inf; number of time steps before each
#' ref_time_value to include in the sliding window computation; Inf to include
#' all times beginning with the min `time_value`
#' @param after integerish >=0; number of time steps after each ref_time_value
#' to include in the sliding window computation
#' @param time_type as in `new_epi_archive`
#' @param out_colnames chr, same length as `in_colnames`; column names to use
#' for results
#' @return list of tibbles, each with the same columns as the `subtbl`s plus
#'   `c(out_colnames, "version")`; (compactified) diff data to put into an
#'   `epi_archive`
#'
#' @examples
#'
#' library(dplyr)
#' updates <- bind_rows(
#' tibble(version = 10, time_value = 1:20, value = 1:20),
#' tibble(version = 12, time_value = 4:5, value = 5:4),
#' tibble(version = 13, time_value = 8, value = 9),
#' tibble(version = 14, time_value = 11, value = NA),
#' tibble(version = 15, time_value = -10, value = -10),
#' tibble(version = 16, time_value = 50, value = 50)
#' ) %>%
#' mutate(across(c(version, time_value), ~ as.Date("2020-01-01") - 1 + .x)) %>%
#' tidyr::nest(.by = version, .key = "subtbl")
#'
#' f <- purrr::partial(data.table::frollmean, algo = "exact")
#'
#' updates %>%
#' epiprocess:::epi_slide_opt_archive_one_epikey(
#' "value", f, "data.table", 2L, 0L, "day", "slide_value"
#' )
#'
#' @keywords internal
epi_slide_opt_archive_one_epikey <- function(
updates,
in_colnames,
f_dots_baked, f_from_package, before, after, time_type,
out_colnames) {
# TODO check for col name clobbering
unit_step <- unit_time_delta(time_type)
prev_inp_snapshot <- NULL
prev_out_snapshot <- NULL
result <- map(seq_len(nrow(updates)), function(update_i) {
version <- updates$version[[update_i]]
inp_update <- updates$subtbl[[update_i]]
inp_snapshot <- tbl_patch(prev_inp_snapshot, inp_update, "time_value")
if (before == Inf) {
if (after != 0) {
cli_abort('.window_size = Inf is only supported with .align = "right"')
}
# We need to use the entire input snapshot range, filling in time gaps. We
# shouldn't pad the ends.
slide_t_min <- min(inp_snapshot$time_value)
slide_t_max <- max(inp_snapshot$time_value)
} else {
# If the input had updates in the range t1..t2, this could produce changes
# in slide outputs in the range t1-after..t2+before, and to compute those
# slide values, we need to look at the input snapshot from
# t1-after-before..t2+before+after. nolint: commented_code_linter
Comment on lines +68 to +70
Contributor (question): I understand how we arrived at t1-after..t2+before, but where did the t1-after-before.... come from?

Contributor Author: Suppose before = 6, after = 0, and there's an update to the 1d value at time t alone. We need to compute the trailing 7dav for t..t+6. But to compute the trailing 7dav for time t, we need 1d data from t-6..t. So we end up needing the union of the ranges t-6..t, t-5..t+1, t-4..t+2, ..., t..t+6, which gives us t-6..t+6.
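
A quick numeric check of the reply above (toy values; before/after as in the surrounding code):

before <- 6; after <- 0; t <- 100 # an update at time t alone
# outputs possibly affected: t-after .. t+before; each output at `ref` needs
# inputs ref-before .. ref+after, so the union of required inputs is:
refs <- (t - after):(t + before)
range(unlist(lapply(refs, function(ref) (ref - before):(ref + after))))
#> 94 106, i.e. t-(before+after) .. t+(before+after)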

inp_update_t_min <- min(inp_update$time_value)
inp_update_t_max <- max(inp_update$time_value)
slide_t_min <- inp_update_t_min - (before + after) * unit_step
slide_t_max <- inp_update_t_max + (before + after) * unit_step
}
slide_nrow <- time_delta_to_n_steps(slide_t_max - slide_t_min, time_type) + 1L
slide_time_values <- slide_t_min + 0L:(slide_nrow - 1L) * unit_step
slide_inp_backrefs <- vec_match(slide_time_values, inp_snapshot$time_value)
# Get additional values needed from inp_snapshot + perform any NA
# tail-padding needed to make slider results a fixed window size rather than
# adaptive at tails + perform any NA gap-filling needed:
slide <- inp_snapshot[slide_inp_backrefs, ]
slide$time_value <- slide_time_values
if (f_from_package == "data.table") {
for (col_i in seq_along(in_colnames)) {
if (before == Inf) {
slide[[out_colnames[[col_i]]]] <-
f_dots_baked(slide[[in_colnames[[col_i]]]], seq_len(slide_nrow), adaptive = TRUE)
} else {
out_col <- f_dots_baked(slide[[in_colnames[[col_i]]]], before + after + 1L)
if (after != 0L) {
# data.table always puts NAs at tails, even with na.rm = TRUE; chop
# off extra NAs from beginning and place missing NAs at end:
out_col <- c(out_col[seq(after + 1L, slide_nrow)], rep(NA, after))
Contributor (issue): This doesn't respect data.table's fill param. If a computation uses fill, these values should not be NA. I think what we really want to do is move the first `after` values to the end of the result.

epi_slide_opt.epi_df has the same behavior.

Contributor Author: Nice catch! I think I was worried about some garbage-rather-than-NA being in the tails, which is why I didn't just copy that approach, but on second look that worry seems unfounded.

}
slide[[out_colnames[[col_i]]]] <- out_col
}
}
} else if (f_from_package == "slider") {
for (col_i in seq_along(in_colnames)) {
slide[[out_colnames[[col_i]]]] <- f_dots_baked(slide[[in_colnames[[col_i]]]], before = before, after = after)
}
} else {
cli_abort("epiprocess internal error: `f_from_package` was {format_chr_deparse(f_from_package)}")
}
Comment on lines +84 to +105
Contributor (suggestion): data.table functions can process more than one column at a time, and I assume it's faster. So we don't need the column loop in the data.table branch here.

Suggested change
if (f_from_package == "data.table") {
for (col_i in seq_along(in_colnames)) {
if (before == Inf) {
slide[[out_colnames[[col_i]]]] <-
f_dots_baked(slide[[in_colnames[[col_i]]]], seq_len(slide_nrow), adaptive = TRUE)
} else {
out_col <- f_dots_baked(slide[[in_colnames[[col_i]]]], before + after + 1L)
if (after != 0L) {
# data.table always puts NAs at tails, even with na.rm = TRUE; chop
# off extra NAs from beginning and place missing NAs at end:
out_col <- c(out_col[seq(after + 1L, slide_nrow)], rep(NA, after))
}
slide[[out_colnames[[col_i]]]] <- out_col
}
}
} else if (f_from_package == "slider") {
for (col_i in seq_along(in_colnames)) {
slide[[out_colnames[[col_i]]]] <- f_dots_baked(slide[[in_colnames[[col_i]]]], before = before, after = after)
}
} else {
cli_abort("epiprocess internal error: `f_from_package` was {format_chr_deparse(f_from_package)}")
}
if (f_from_package == "data.table") {
if (before == Inf) {
slide[, out_colnames] <-
f_dots_baked(slide[, in_colnames], seq_len(slide_nrow), adaptive = TRUE)
} else {
out_cols <- f_dots_baked(slide[, in_colnames], before + after + 1L)
if (after != 0L) {
# data.table always puts NAs at tails, even with na.rm = TRUE; chop
# off extra NAs from beginning and place missing NAs at end:
out_cols <- purrr::map(out_cols, function(.x) {
c(.x[seq(after + 1L, slide_nrow)], rep(NA, after))
})
}
slide[, out_colnames] <- out_cols
}
} else if (f_from_package == "slider") {
for (col_i in seq_along(in_colnames)) {
slide[[out_colnames[[col_i]]]] <- f_dots_baked(slide[[in_colnames[[col_i]]]], before = before, after = after)
}
} else {
cli_abort("epiprocess internal error: `f_from_package` was {format_chr_deparse(f_from_package)}")
}
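
For reference on the suggestion above: data.table's froll* functions accept a list/data.frame of columns at once and return a list of result vectors (toy data):

library(data.table)
dt <- data.table(a = 1:5, b = 11:15)
frollmean(dt[, .(a, b)], n = 3) # list of two vectors, NA-padded at the window start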

rows_should_keep <-
if (before == Inf) {
# Re-introduce time gaps:
!is.na(slide_inp_backrefs)
} else {
# Get back to t1-after..t2+before; times outside this range were included
# only so those inside would have enough context for their slide
# computations, but these "context" rows may contain invalid slide
# computation outputs:
vec_rep_each(c(FALSE, TRUE, FALSE), c(before, slide_nrow - before - after, after)) &
# Only include time_values that appeared in the input snapshot:
!is.na(slide_inp_backrefs)
}
out_update <- slide[rows_should_keep, ]
out_diff <- tbl_diff2(prev_out_snapshot, out_update, "time_value", "update")
out_snapshot <- tbl_patch(prev_out_snapshot, out_diff, "time_value")
prev_inp_snapshot <<- inp_snapshot
prev_out_snapshot <<- out_snapshot
out_diff$version <- version
out_diff
Contributor (suggestion): this always returns a compactified archive. Will users ever want a non-compactified result? If we add a compactify arg or inherit compactification status from the original archive, the user could toggle whether we return out_snapshot here instead of out_diff.

Contributor Author: Perhaps; some thoughts:

  • Recording all full out_snapshots seems like it will potentially give a very hefty result, and they could get that same result via something like epix_slide(~ .x) on the result or a helper function doing something like that.
  • Recording just out_snapshot restricted to times that were in inp_update, plus any times that are required for validity (e.g., a change in just the time-t 1d value producing an update to the trailing 7dav for t+1..t+6 as well as t), seems maybe more useful. I'm not sure how hard/slow this would be; I had some issues trying to selectively apply compactify_tol to the 7dav/etc. cols and set aside that idea.

})
result
}

#' @method epi_slide_opt grouped_epi_archive
#' @export
epi_slide_opt.grouped_epi_archive <- function(.x, ...) {
assert_set_equal(
group_vars(.x),
key_colnames(.x$private$ungrouped, exclude = c("time_value", "version"))
)
orig_group_vars <- group_vars(.x)
orig_drop <- .x$private$drop
.x %>%
ungroup() %>%
epi_slide_opt(...) %>%
group_by(pick(all_of(orig_group_vars)), .drop = orig_drop)
}
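
A hedged usage sketch for the grouped method above (toy archive; grouping must match the non-time, non-version key columns):

library(epiprocess)
library(dplyr)
toy <- tibble::tibble(
  geo_value = c("ca", "ca", "tx", "tx"),
  time_value = as.Date("2020-01-01") + c(0:1, 0:1),
  version = as.Date("2020-01-03"),
  value = c(1, 2, 3, 4)
) %>%
  as_epi_archive()
toy %>%
  group_by(geo_value) %>%
  epi_slide_opt(value, data.table::frollmean, .window_size = 2)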

#' @method epi_slide_opt epi_archive
#' @export
epi_slide_opt.epi_archive <-
function(.x, .col_names, .f, ...,
.window_size = NULL, .align = c("right", "center", "left"),
.prefix = NULL, .suffix = NULL, .new_col_names = NULL,
.ref_time_values = NULL, .all_rows = FALSE,
.progress = FALSE) {
# Extract metadata:
time_type <- .x$time_type
epikey_names <- key_colnames(.x, exclude = c("time_value", "version"))
# Validation & pre-processing:
.align <- arg_match(.align)
.f_info <- upstream_slide_f_info(.f)
.f_dots_baked <-
if (rlang::dots_n(...) == 0L) {
# Leaving `.f` unchanged slightly improves computation speed and trims
# debug stack traces:
.f
} else {
purrr::partial(.f, ...)
}
col_names_quo <- enquo(.col_names)
names_info <- across_ish_names_info(
.x$DT, time_type, col_names_quo, .f_info$namer,
.window_size, .align, .prefix, .suffix, .new_col_names
)
window_args <- get_before_after_from_window(.window_size, .align, time_type)
if (!is.null(.ref_time_values)) {
cli_abort("epi_slide.epi_archive does not support the `.ref_time_values` argument")
}
if (!identical(.all_rows, FALSE)) {
cli_abort("epi_slide.epi_archive does not support the `.all_rows` argument")
}
assert(
checkmate::check_logical(.progress, any.missing = FALSE, len = 1L, names = "unnamed"),
checkmate::check_string(.progress)
)
if (isTRUE(.progress)) {
.progress <- "Time series processed:"
}
use_progress <- !isFALSE(.progress)
Comment on lines +183 to +186
Contributor (suggestion):

Suggested change
if (isTRUE(.progress)) {
.progress <- "Time series processed:"
}
use_progress <- !isFALSE(.progress)
use_progress <- isTRUE(.progress)
if (use_progress) {
.progress <- "Time series processed:"
}

# Perform the slide:
updates_grouped <- .x$DT %>%
as.data.frame() %>%
as_tibble(.name_repair = "minimal") %>%
# 0 rows input -> 0 rows output, so we can just say drop = TRUE:
grouped_df(epikey_names, TRUE)
if (use_progress) progress_bar_id <- cli::cli_progress_bar(.progress, total = n_groups(updates_grouped))
result <- updates_grouped %>%
group_modify(function(group_values, group_key) {
group_updates <- group_values %>%
nest(.by = version, .key = "subtbl") %>%
arrange(version)
# TODO move nesting inside the helper?
Contributor (issue): some TODOs and "XXX"s left. Please either implement, or convert to issues and link to them from code.

res <- epi_slide_opt_archive_one_epikey(
group_updates,
names_info$input_col_names,
.f_dots_baked, .f_info$from_package, window_args$before, window_args$after, time_type,
names_info$output_col_names
) %>%
list_rbind()
if (use_progress) cli::cli_progress_update(id = progress_bar_id)
res
}) %>%
ungroup() %>%
as.data.frame() %>% # data.table#6859
as.data.table(key = key(.x$DT)) %>%
new_epi_archive(
.x$geo_type, .x$time_type, .x$other_keys,
.x$clobberable_versions_start, .x$versions_end
)
if (use_progress) cli::cli_progress_done(id = progress_bar_id)
# Keep ordering of old columns, place new columns at end:
setcolorder(result$DT, names(.x$DT))
result
}
694 changes: 694 additions & 0 deletions R/epi_slide_opt_edf.R

Large diffs are not rendered by default.

25 changes: 25 additions & 0 deletions R/epiprocess-package.R
@@ -6,36 +6,61 @@
#' @importFrom checkmate anyInfinite anyMissing assert assert_character
#' @importFrom checkmate assert_class assert_data_frame assert_int assert_list
#' @importFrom checkmate assert_false
#' @importFrom checkmate assert_function
#' @importFrom checkmate assert_logical assert_numeric assert_scalar checkInt
#' @importFrom checkmate assert_set_equal
#' @importFrom checkmate assert_string
#' @importFrom checkmate assert_subset
#' @importFrom checkmate assert_tibble
#' @importFrom checkmate assert_true
#' @importFrom checkmate check_atomic check_data_frame expect_class test_int
#' @importFrom checkmate check_character
#' @importFrom checkmate check_logical
#' @importFrom checkmate check_names
#' @importFrom checkmate check_null
#' @importFrom checkmate check_numeric
#' @importFrom checkmate test_subset test_set_equal vname
#' @importFrom cli cli_abort cli_warn
#' @importFrom cli pluralize
#' @importFrom cli qty
#' @importFrom data.table as.data.table
#' @importFrom data.table fifelse
#' @importFrom data.table key
#' @importFrom data.table setcolorder
#' @importFrom data.table setDT
#' @importFrom data.table setkeyv
#' @importFrom dplyr arrange
#' @importFrom dplyr grouped_df
#' @importFrom dplyr is_grouped_df
#' @importFrom dplyr n_groups
#' @importFrom dplyr select
#' @importFrom lifecycle deprecated
#' @importFrom purrr list_rbind
#' @importFrom rlang %||%
#' @importFrom rlang arg_match0
#' @importFrom rlang is_bare_integerish
#' @importFrom rlang is_bare_list
#' @importFrom rlang is_bare_numeric
#' @importFrom tibble is_tibble
#' @importFrom tidyr nest
#' @importFrom tools toTitleCase
#' @importFrom vctrs obj_is_vector
#' @importFrom vctrs vec_cast
#' @importFrom vctrs vec_cast_common
#' @importFrom vctrs vec_data
#' @importFrom vctrs vec_duplicate_id
#' @importFrom vctrs vec_equal
#' @importFrom vctrs vec_in
#' @importFrom vctrs vec_match
#' @importFrom vctrs vec_order
#' @importFrom vctrs vec_ptype
#' @importFrom vctrs vec_rbind
#' @importFrom vctrs vec_recycle
#' @importFrom vctrs vec_recycle_common
#' @importFrom vctrs vec_rep
#' @importFrom vctrs vec_rep_each
#' @importFrom vctrs vec_seq_along
#' @importFrom vctrs vec_size_common
#' @importFrom vctrs vec_slice
#' @importFrom vctrs vec_slice<-
#' @importFrom vctrs vec_sort
348 changes: 348 additions & 0 deletions R/patch.R
@@ -0,0 +1,348 @@
#' Test two vctrs vectors for equality with some tolerance in some cases
#'
#' Similar to [`vctrs::vec_equal`]. Behavior may differ from `vec_equal` with
#' non-`NA` `NaN`s involved, or for bare lists that contain named vectors, and
#' the precise behavior in these cases may change and should not be relied upon.
#'
#' @param vec1,vec2 vctrs vectors (includes data frames). Take care when using
#' on named vectors or "keyed" data frames; [`vec_names()`] are largely
#' ignored, and key columns are treated as normal value columns (when they
#' should probably generate an error if they are not lined up correctly, or be
#' tested for exact rather than approximate equality).
#' @param na_equal should `NA`s be considered equal to each other? (In
#' epiprocess, we usually want this to be `TRUE`, but that doesn't match the
#' [`vctrs::vec_equal()`] default, so this is mandatory.)
#' @param .ptype as in [`vctrs::vec_equal()`].
#' @param ... should be empty (it's here to force later arguments to be passed
#' by name)
#' @param abs_tol absolute tolerance; will be used for bare numeric `vec1`,
#' `vec2`, or any such columns within `vec1`, `vec2` if they are data frames.
#' @param inds1,inds2 optional (row) indices into vec1 and vec2 compatible with
#' [`vctrs::vec_slice()`]; output should be consistent with `vec_slice`-ing to
#' these indices beforehand, but can give faster computation if `vec1` and
#' `vec2` are data frames.
#'
#' @return logical vector, with length matching the result of recycling `vec1`
#' (at `inds1` if provided) and `vec2` (at `inds2` if provided); entries
#' should all be `TRUE` or `FALSE` if `na_equal = TRUE`.
#'
#' @examples
#'
#' library(dplyr)
#'
#' # On numeric vectors:
#' vec_approx_equal(
#' c(1, 2, 3, NA),
#' c(1, 2 + 1e-10, NA, NA),
#' na_equal = TRUE,
#' abs_tol = 1e-8
#' )
#'
#' # On tibbles:
#' tbl1 <- tibble(
#' a = 1:5,
#' b = list(1:5, 1:4, 1:3, 1:2, 1:1) %>% lapply(as.numeric),
#' c = tibble(
#' c1 = 1:5
#' ),
#' d = matrix(1:10, 5, 2)
#' )
#' tbl2 <- tbl1
#' tbl2$a[[2]] <- tbl1$a[[2]] + 1e-10
#' tbl2$b[[3]][[1]] <- tbl1$b[[3]][[1]] + 1e-10
#' tbl2$c$c1[[4]] <- tbl1$c$c1[[4]] + 1e-10
#' tbl2$d[[5, 2]] <- tbl1$d[[5, 2]] + 1e-10
#' vctrs::vec_equal(tbl1, tbl2, na_equal = TRUE)
#' vec_approx_equal(tbl1, tbl2, na_equal = TRUE, abs_tol = 1e-12)
#' vec_approx_equal(tbl1, tbl2, na_equal = TRUE, abs_tol = 1e-8)
#'
#' # Type comparison within lists is stricter, matching vctrs:
#' vctrs::vec_equal(list(1:2), list(as.numeric(1:2)))
#' vec_approx_equal(list(1:2), list(as.numeric(1:2)), FALSE, abs_tol = 0)
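#'
#' # (Hypothetical sketch) `inds1`/`inds2` compare row subsets without
#' # materializing slices first; results should match vec_slice-ing beforehand:
#' tbl3 <- tibble(x = c(1, 2, 3), y = c(4, 5, 6))
#' tbl4 <- tibble(x = c(1, 2 + 1e-10, 3), y = c(4, 5, 60))
#' vec_approx_equal(tbl3, tbl4,
#'   na_equal = TRUE, abs_tol = 1e-8,
#'   inds1 = c(1, 2), inds2 = c(1, 2)
#' )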
#'
#' @export
vec_approx_equal <- function(vec1, vec2, na_equal, .ptype = NULL, ..., abs_tol, inds1 = NULL, inds2 = NULL) {
if (!obj_is_vector(vec1)) cli_abort("`vec1` must be recognized by vctrs as a vector")
if (!obj_is_vector(vec2)) cli_abort("`vec2` must be recognized by vctrs as a vector")
# Leave vec size checking to vctrs recycling ops.
assert_logical(na_equal, any.missing = FALSE, len = 1L)
# Leave .ptype checks to cast operation.
check_dots_empty()
assert_numeric(abs_tol, lower = 0, len = 1L)
assert(
check_null(inds1),
check_numeric(inds1),
check_logical(inds1),
check_character(inds1)
)
assert(
check_null(inds2),
check_numeric(inds2),
check_logical(inds2),
check_character(inds2)
)
# Leave heavier index validation to the vctrs recycling & indexing ops.

# Recycle inds if provided; vecs if not:
common_size <- vec_size_common(
if (is.null(inds1)) vec1 else inds1,
if (is.null(inds2)) vec2 else inds2
)
if (is.null(inds1)) {
vec1 <- vec_recycle(vec1, common_size)
} else {
inds1 <- vec_recycle(inds1, common_size)
}
if (is.null(inds2)) {
vec2 <- vec_recycle(vec2, common_size)
} else {
inds2 <- vec_recycle(inds2, common_size)
}
vecs <- vec_cast_common(vec1, vec2, .to = .ptype)
vec_approx_equal0(vecs[[1]], vecs[[2]], na_equal, abs_tol, inds1, inds2)
}

#' Helper for [`vec_approx_equal`] for vecs guaranteed to have the same ptype and size
#'
#' @keywords internal
vec_approx_equal0 <- function(vec1, vec2, na_equal, abs_tol, inds1 = NULL, inds2 = NULL) {
if (is_bare_numeric(vec1) && abs_tol != 0) {
# perf: since we're working with bare numerics and logicals: we can use `[`
# and `fifelse`. Matching vec_equal, we ignore names and other attributes.
if (!is.null(inds1)) vec1 <- vec_slice(vec1, inds1)
if (!is.null(inds2)) vec2 <- vec_slice(vec2, inds2)
res <- fifelse(
!is.na(vec1) & !is.na(vec2),
abs(vec1 - vec2) <= abs_tol,
if (na_equal) is.na(vec1) & is.na(vec2) else NA
# XXX ^ inconsistent with vec_equal treatment: NA vs. NaN comparison
# behavior with na_equal = TRUE is different
)
if (!is.null(dim(vec1))) {
dim(res) <- dim(vec1)
res <- rowSums(res) == ncol(res)
}
# `fifelse` inherits any unrecognized attributes; drop them instead:
attributes(res) <- NULL
return(res)
} else if (is.data.frame(vec1) && abs_tol != 0) {
# (we only need to manually recurse if we potentially have columns that would
# be specially processed by the above)
if (ncol(vec1) == 0) {
rep(TRUE, nrow(vec1))
} else {
Reduce(`&`, lapply(seq_len(ncol(vec1)), function(col_i) {
vec_approx_equal0(vec1[[col_i]], vec2[[col_i]], na_equal, abs_tol, inds1, inds2)
}))
}
} else if (is_bare_list(vec1)) {
# Bare lists skip the vec_slice-based branches above, so honor `inds1`/`inds2`
# here by slicing up front:
if (!is.null(inds1)) vec1 <- vec_slice(vec1, inds1)
if (!is.null(inds2)) vec2 <- vec_slice(vec2, inds2)
vapply(seq_along(vec1), function(i) {
entry1 <- vec1[[i]]
entry2 <- vec2[[i]]
vec_size(entry1) == vec_size(entry2) &&
# This is inconsistent with vec_equal on named vectors; to be
# consistently inconsistent, we avoid dispatching to vec_equal for bare
# lists even with abs_tol = 0:
identical(vec_ptype(entry1), vec_ptype(entry2)) &&
all(vec_approx_equal0(entry1, entry2, na_equal, abs_tol))
}, logical(1L))
} else {
# XXX No special handling for any other types/situations. Makes sense for
# unclassed atomic things; custom classes (e.g., distributions) might want
# recursion / specialization, though. vec_approx_equal0 should probably be an S3
# method. Also, abs_tol == 0 --> vec_equal logic should maybe be either be
# hoisted to vec_approx_equal or we should manually recurse on data frames even
# with abs_tol = 0 when that's faster (might depend on presence of inds*),
# after some inconsistencies are ironed out.
if (!is.null(inds1)) {
vec1 <- vec_slice(vec1, inds1)
vec2 <- vec_slice(vec2, inds2)
}
res <- vec_equal(vec1, vec2, na_equal = na_equal)
return(res)
}
}

#' Calculate compact patch to move from one snapshot/update to another
#'
#' @param earlier_snapshot tibble or `NULL`; `NULL` represents that there was no
#' data before `later_tbl`.
#' @param later_tbl tibble; must have the same column names as
#' `earlier_snapshot` if it is a tibble.
#' @param ukey_names character; column names that together, form a unique key
#' for `earlier_snapshot` and for `later_tbl`. This is unchecked; see
#' [`check_ukey_unique`] if you don't already have this guaranteed.
#' @param later_format "snapshot" or "update"; default is "snapshot". If
#' "snapshot", `later_tbl` will be interpreted as a full snapshot of the data
#' set including all ukeys, and any ukeys that are in `earlier_snapshot` but
#' not in `later_tbl` are interpreted as deletions, which are currently
#' (imprecisely) represented in the output patch as revisions of all
#' non-`ukey_names` columns to NA values (using `{vctrs}`). If "update", then
#' it's assumed that any deletions have already been represented this way in
#' `later_tbl` and any ukeys not in `later_tbl` are simply unchanged; we are
#' just ensuring that the update is fully compact for the given
#' `compactify_abs_tol`.
#' @param compactify_abs_tol compactification tolerance; see `apply_compactify`
#' @return a tibble in compact "update" (diff) format
#'
#' @keywords internal
tbl_diff2 <- function(earlier_snapshot, later_tbl,
ukey_names,
later_format = c("snapshot", "update"),
compactify_abs_tol = 0) {
# Most input validation + handle NULL earlier_snapshot. This is a small function so
# use faster validation variants:
if (!is_tibble(later_tbl)) {
cli_abort("`later_tbl` must be a tibble")
Contributor (suggestion): please add class names to errors.

}
if (is.null(earlier_snapshot)) {
return(later_tbl)
}
if (!is_tibble(earlier_snapshot)) {
cli_abort("`earlier_snapshot` must be a tibble or `NULL`")
}
if (!is.character(ukey_names) || !all(ukey_names %in% names(earlier_snapshot))) {
cli_abort("`ukey_names` must be a subset of column names")
}
later_format <- arg_match0(later_format, c("snapshot", "update"))
if (!(is.vector(compactify_abs_tol, mode = "numeric") &&
length(compactify_abs_tol) == 1L && # nolint:indentation_linter
compactify_abs_tol >= 0)) {
# Give a specific message:
assert_numeric(compactify_abs_tol, lower = 0, any.missing = FALSE, len = 1L)
# Fallback e.g. for invalid classes not caught by assert_numeric:
cli_abort("`compactify_abs_tol` must be a length-1 double/integer >= 0")
}

# Extract metadata:
earlier_n <- nrow(earlier_snapshot)
later_n <- nrow(later_tbl)
tbl_names <- names(earlier_snapshot)
val_names <- tbl_names[!tbl_names %in% ukey_names]

# More input validation:
if (!identical(tbl_names, names(later_tbl))) {
cli_abort(c("`earlier_snapshot` and `later_tbl` should have identical column
names and ordering.",
"*" = "`earlier_snapshot` colnames: {format_chr_deparse(tbl_names)}",
"*" = "`later_tbl` colnames: {format_chr_deparse(names(later_tbl))}"
))
}

combined_tbl <- vec_rbind(earlier_snapshot, later_tbl)
Contributor (question): I find the bulk of the logic in this function to be roundabout. Since we're manually lining up obs from earlier/later tbls multiple times, there are lots of opportunities for errors.

Is there a reason we didn't go with a full join? If the concern was performance, I'm curious what the difference was between a join and the rbind + manual alignment.

Contributor Author (@brookslogan, Mar 19, 2025): Performance was one, with a full_join-based approach taking about 4x the time of this approach for one test case [and something like 3x--6.5x the time for the slide operation across various test archives, though this was for a different epikey&time looping structure]. (That full_join approach didn't consider compactification tol, which means considering it would probably make it a bit slower still.)

There was a pretty fast, pretty legible approach in terms of anti joins that used a simpler form of this duplicate-logic to make a faster anti join. Unfortunately, I had trouble implementing tolerance in an efficient way there, but I could take another look if this is too much gibberish.

combined_n <- nrow(combined_tbl)

# We'll also need epikeytimes and value columns separately:
combined_ukeys <- combined_tbl[ukey_names]
combined_vals <- combined_tbl[val_names]

# We have five types of rows in combined_tbl:
# 1. From earlier_snapshot, no matching ukey in later_tbl (deletion; turn vals to
# NAs to match epi_archive format)
# 2. From earlier_snapshot, with matching ukey in later_tbl (context; exclude from
Contributor (question): I don't understand this category. How is it different from categories 3/4? Maybe category 2 contains a duplicate row for a given ukey sourced from earlier_snapshot, but we're already keeping the corresponding ukey sourced from later_tbl?

Contributor Author (@brookslogan, Mar 19, 2025): (You're describing what happens when the corresponding row in later_tbl is in category 4, where we throw away the category 2 row and keep the category 4 row. But the later row could also be in category 3, in which case we throw out both the category 2 row and the category 3 row.)

It gives context for what to do with the corresponding row in later_tbl, which will be category 3/4. combined_tbl has earlier followed by later, so vec_duplicate_id(combined_ukeys) will point from the category 3/4 row back to the corresponding category 2 row; we can then look at the measurement cols to decide whether we're in case 3 or case 4. The category 2 row should be thrown out in either case; if the later row is category 3 then we should not have this ukey in the diff result at all, and if the later row is category 4 then we should only have that later row in the diff result.

# result)
# 3. From later_tbl, with matching ukey in earlier_snapshot, with value "close" (change
# that we'll compactify away)
# 4. From later_tbl, with matching ukey in earlier_snapshot, value not "close" (change
# that we'll record)
# 5. From later_tbl, with no matching ukey in earlier_snapshot (addition)

# For "snapshot" later_format, we need to filter to 1., 4., and 5., and alter
# values for 1. For "update" later_format, we need to filter to 4. and 5.

# (For compactify_abs_tol = 0, we could potentially streamline things by dropping
# ukey+val duplicates (cases 2. and 3.).)

# Row indices of first occurrence of each ukey; will be the same as
# seq_len(combined_n) except for when that ukey has been re-reported in
# `later_tbl`, in which case (3. or 4.) it will point back to the row index of
# the same ukey in `earlier_snapshot`:
combined_ukey_firsts <- vec_duplicate_id(combined_ukeys)

# Which rows from combined are cases 3. or 4.?
combined_ukey_is_repeat <- combined_ukey_firsts != seq_len(combined_n)
# For each row in 3. or 4., row numbers of the ukey appearance in earlier:
ukey_repeat_first_i <- combined_ukey_firsts[combined_ukey_is_repeat]

# Which rows from combined are in case 3.?
combined_compactify_away <- rep(FALSE, combined_n)
combined_compactify_away[combined_ukey_is_repeat] <-
vec_approx_equal0(combined_vals,
combined_vals,
na_equal = TRUE,
abs_tol = compactify_abs_tol,
inds1 = combined_ukey_is_repeat,
inds2 = ukey_repeat_first_i
)

# Which rows from combined are in cases 3., 4., or 5.?
combined_from_later <- vec_rep_each(c(FALSE, TRUE), c(earlier_n, later_n))

if (later_format == "update") {
# Cases 4. and 5.:
combined_tbl <- combined_tbl[combined_from_later & !combined_compactify_away, ]
} else { # later_format is "snapshot"
# Which rows from combined are in case 1.?
combined_is_deletion <- vec_rep_each(c(TRUE, FALSE), c(earlier_n, later_n))
combined_is_deletion[ukey_repeat_first_i] <- FALSE
# Which rows from combined are in cases 1., 4., or 5.?
combined_include <- combined_is_deletion | combined_from_later & !combined_compactify_away
Contributor (nit): please add parens for clarity

combined_tbl <- combined_tbl[combined_include, ]
# Represent deletion in 1. with NA-ing of all value columns. (In some
# previous approaches to epi_diff2, this seemed to be faster than using
# vec_rbind(case_1_ukeys, cases_45_tbl) or bind_rows to fill with NAs, and more
# general than data.table's rbind(case_1_ukeys, cases_45_tbl, fill = TRUE).)
combined_tbl[combined_is_deletion[combined_include], val_names] <- NA
}

combined_tbl
}
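
A minimal sketch of the diff semantics (internal helper, hence `epiprocess:::`; toy tibbles; later_format defaults to "snapshot"):

library(tibble)
earlier <- tibble(time_value = 1:3, value = c(1, 2, 3))
later <- tibble(time_value = 2:3, value = c(2, 30))
# time 1 absent from the snapshot -> deletion (values NA-ed); time 2 unchanged
# -> compactified away; time 3 changed -> recorded:
epiprocess:::tbl_diff2(earlier, later, "time_value")
#> rows: (1, NA), (3, 30)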

#' Apply an update (e.g., from `tbl_diff2`) to a snapshot
#'
#' @param snapshot tibble or `NULL`; entire data set as of some version, or
#' `NULL` to treat `update` as the initial version of the data set.
#' @param update tibble; ukeys + initial values for added rows, ukeys + new
#' values for changed rows. Deletions must be imprecisely represented as
#' changing all values to NAs.
#' @param ukey_names character; names of columns that should form a unique key
#' for `snapshot` and for `update`. Uniqueness is unchecked; if you don't have
#' this guaranteed, see [`check_ukey_unique()`].
#' @return tibble; snapshot of the data set with the update applied.
#'
#' @keywords internal
tbl_patch <- function(snapshot, update, ukey_names) {
# Most input validation. This is a small function so use faster validation
# variants:
if (!is_tibble(update)) {
cli_abort("`update` must be a tibble")
}
if (is.null(snapshot)) {
return(update)
}
if (!is_tibble(snapshot)) {
cli_abort("`snapshot` must be a tibble")
}
if (!is.character(ukey_names) || !all(ukey_names %in% names(snapshot))) {
cli_abort("`ukey_names` must be a subset of column names")
}
if (!identical(names(snapshot), names(update))) {
cli_abort(c("`snapshot` and `update` should have identical column
names and ordering.",
"*" = "`snapshot` colnames: {format_chr_deparse(tbl_names)}",
"*" = "`update` colnames: {format_chr_deparse(names(update))}"
))
}

result_tbl <- vec_rbind(update, snapshot)

dup_ids <- vec_duplicate_id(result_tbl[ukey_names])
not_overwritten <- dup_ids == vec_seq_along(result_tbl)
result_tbl <- result_tbl[not_overwritten, ]

result_tbl
}
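
A minimal sketch of the patch semantics (internal helper, hence `epiprocess:::`; toy tibbles):

library(tibble)
snapshot <- tibble(time_value = 1:3, value = c(1, 2, 3))
update <- tibble(time_value = c(2L, 4L), value = c(20, 40))
# Update rows win over snapshot rows sharing the same ukey; new ukeys are added:
epiprocess:::tbl_patch(snapshot, update, "time_value")
#> rows: (2, 20), (4, 40), (1, 1), (3, 3) (update-first ordering)
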
610 changes: 0 additions & 610 deletions R/slide.R

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions _pkgdown.yml
@@ -93,6 +93,10 @@ reference:
- epidatasets::covid_incidence_outliers
- epidatasets::covid_case_death_rates_extended

- title: Other utilities
- contents:
- vec_approx_equal

- title: internal
- contents:
- starts_with("internal")
46 changes: 46 additions & 0 deletions man/across_ish_names_info.Rd
2 changes: 1 addition & 1 deletion man/apply_compactify.Rd
23 changes: 15 additions & 8 deletions man/epi_archive.Rd
27 changes: 22 additions & 5 deletions man/epi_slide_opt.Rd
72 changes: 72 additions & 0 deletions man/epi_slide_opt_archive_one_epikey.Rd
2 changes: 1 addition & 1 deletion man/full_date_seq.Rd
32 changes: 0 additions & 32 deletions man/is_locf.Rd

This file was deleted.

45 changes: 45 additions & 0 deletions man/tbl_diff2.Rd
27 changes: 27 additions & 0 deletions man/tbl_patch.Rd
23 changes: 23 additions & 0 deletions man/upstream_slide_f_info.Rd
16 changes: 16 additions & 0 deletions man/upstream_slide_f_possibilities.Rd
90 changes: 90 additions & 0 deletions man/vec_approx_equal.Rd
12 changes: 12 additions & 0 deletions man/vec_approx_equal0.Rd
24 changes: 16 additions & 8 deletions tests/testthat/test-archive.R
@@ -92,7 +92,7 @@ test_that("epi_archives are correctly instantiated with a variety of data types"
expect_equal(key(ea1$DT), c("geo_value", "time_value", "version"))

ea2 <- as_epi_archive(df, other_keys = "value", compactify = FALSE)
expect_equal(key(ea2$DT), c("geo_value", "time_value", "value", "version"))
expect_equal(key(ea2$DT), c("geo_value", "value", "time_value", "version"))

# Tibble
tib <- tibble::tibble(df, code = "x")
@@ -101,7 +101,7 @@ test_that("epi_archives are correctly instantiated with a variety of data types"
expect_equal(key(ea3$DT), c("geo_value", "time_value", "version"))

ea4 <- as_epi_archive(tib, other_keys = "code", compactify = FALSE)
expect_equal(key(ea4$DT), c("geo_value", "time_value", "code", "version"))
expect_equal(key(ea4$DT), c("geo_value", "code", "time_value", "version"))

# Keyed data.table
kdt <- data.table::data.table(
@@ -119,7 +119,7 @@ test_that("epi_archives are correctly instantiated with a variety of data types"

ea6 <- as_epi_archive(kdt, other_keys = "value", compactify = FALSE)
# Mismatched keys, but the one from as_epi_archive overrides
expect_equal(key(ea6$DT), c("geo_value", "time_value", "value", "version"))
expect_equal(key(ea6$DT), c("geo_value", "value", "time_value", "version"))

# Unkeyed data.table
udt <- data.table::data.table(
@@ -134,7 +134,7 @@ test_that("epi_archives are correctly instantiated with a variety of data types"
expect_equal(key(ea7$DT), c("geo_value", "time_value", "version"))

ea8 <- as_epi_archive(udt, other_keys = "code", compactify = FALSE)
expect_equal(key(ea8$DT), c("geo_value", "time_value", "code", "version"))
expect_equal(key(ea8$DT), c("geo_value", "code", "time_value", "version"))

# epi_df
edf1 <- cases_deaths_subset %>%
@@ -145,7 +145,7 @@ test_that("epi_archives are correctly instantiated with a variety of data types"
expect_equal(key(ea9$DT), c("geo_value", "time_value", "version"))

ea10 <- as_epi_archive(edf1, other_keys = "code", compactify = FALSE)
expect_equal(key(ea10$DT), c("geo_value", "time_value", "code", "version"))
expect_equal(key(ea10$DT), c("geo_value", "code", "time_value", "version"))

# Keyed epi_df
edf2 <- data.frame(
@@ -164,7 +164,7 @@ test_that("epi_archives are correctly instantiated with a variety of data types"
expect_equal(key(ea11$DT), c("geo_value", "time_value", "version"))

ea12 <- as_epi_archive(edf2, other_keys = "misc", compactify = FALSE)
expect_equal(key(ea12$DT), c("geo_value", "time_value", "misc", "version"))
expect_equal(key(ea12$DT), c("geo_value", "misc", "time_value", "version"))
})

test_that("`epi_archive` rejects nonunique keys", {
@@ -217,8 +217,16 @@ test_that("`epi_archive` rejects dataframes where time_value and version columns
expect_error(as_epi_archive(tbl3), class = "epiprocess__time_value_version_mismatch")
})

test_that("is_locf works as expected", {
test_that("is_locf replacement works as expected", {
vec <- c(1, 1, 1e-10, 1.1e-10, NA, NA, NaN, NaN)
is_repeated <- c(0, 1, 0, 1, 0, 1, 1, 1)
expect_equal(is_locf(vec, .Machine$double.eps^0.5, FALSE), as.logical(is_repeated))
expect_equal(
c(
FALSE,
vec_approx_equal(head(vec, -1L), tail(vec, -1L),
na_equal = TRUE, abs_tol = .Machine$double.eps^0.5
)
),
as.logical(is_repeated)
)
})
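
For reference, a minimal sketch of the pairwise-shift pattern used in this test, assuming `vec_approx_equal()` compares two equal-length vectors elementwise within the given absolute tolerance and, with `na_equal = TRUE`, treats missing values as equal to each other:

vec <- c(1, 1, 1e-10, 1.1e-10, NA, NA)
vec_approx_equal(head(vec, -1L), tail(vec, -1L),
  na_equal = TRUE, abs_tol = .Machine$double.eps^0.5
)
# Under those assumptions: TRUE FALSE TRUE FALSE TRUE
# (1 ~ 1; 1 vs. 1e-10 differ by ~1; 1e-10 ~ 1.1e-10 within tolerance;
#  1.1e-10 vs. NA differ; NA ~ NA)
# Prepending FALSE for the first element recovers the is_locf-style flags.
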
218 changes: 218 additions & 0 deletions tests/testthat/test-epi_slide_opt_archive.R
@@ -0,0 +1,218 @@
library(dplyr)

test_that("epi_slide_opt_archive_one_epikey works as expected", {
start_date <- as.Date("2020-01-01")

updates <- bind_rows(
tibble(version = 10, time_value = 0:20, value = 0:20),
tibble(version = 12, time_value = 4:5, value = 5:4),
tibble(version = 13, time_value = 8, value = 9),
tibble(version = 14, time_value = 11, value = NA),
tibble(version = 15, time_value = -10, value = -10),
tibble(version = 16, time_value = 50, value = 50)
) %>%
mutate(across(c(version, time_value), ~ start_date - 1 + .x)) %>%
tidyr::nest(.by = version, .key = "subtbl")

expected <- list(
vctrs::vec_cbind(
tibble(version = 10),
updates$subtbl[[1L]] %>%
mutate(time_value = as.numeric(time_value - start_date) + 1) %>%
mutate(slide_value = frollmean(value, 3, algo = "exact"))
),
tibble(
version = 12,
time_value = c(4, 5, 7), # time 6 unchanged, compactified away
# time 7 `value` unchanged, but here because `slide_value` changed:
value = c(5, 4, 7),
slide_value = c(
mean(c(2, 3, 5)),
# time 5 `slide_value` unchanged, but here because `value` changed:
mean(c(3, 5, 4)),
mean(c(4, 6, 7))
)
),
tibble(
version = 13, time_value = 8:10, value = c(9, 9, 10),
slide_value = frollmean(c(6, 7, 9, 9, 10), 3, algo = "exact")[-(1:2)]
),
tibble(
version = 14, time_value = 11:13, value = c(NA, 12, 13), slide_value = rep(NA_real_, 3L)
),
tibble(
version = 15, time_value = -10, value = -10, slide_value = NA_real_
),
tibble(
version = 16, time_value = 50, value = 50, slide_value = NA_real_
)
) %>%
lapply(function(x) {
x %>%
mutate(across(c(version, time_value), ~ start_date - 1 + .x))
})

f <- purrr::partial(data.table::frollmean, algo = "exact")

result <- updates %>%
epiprocess:::epi_slide_opt_archive_one_epikey("value", f, "data.table", 2L, 0L, "day", "slide_value") %>%
lapply(function(x) {
x %>%
arrange(time_value) %>%
select(version, time_value, everything())
})

expect_equal(result, expected)
})


test_that("epi_slide_opt.epi_archive is not confused by unique(DT$version) unsorted", {
start_date <- as.Date("2020-01-01")
tibble(
geo_value = 1,
time_value = start_date - 1 + 1:4,
version = start_date - 1 + c(5, 5, 4, 4),
value = c(1, 2, 3, 4)
) %>%
as_epi_archive() %>%
epi_slide_opt(value, frollmean, .window_size = 2L) %>%
expect_equal(
tibble(
geo_value = 1,
time_value = start_date - 1 + c(1, 2, 3, 3, 4),
version = start_date - 1 + c(5, 5, 4, 5, 4),
value = c(1, 2, 3, 3, 4),
value_2dav = c(NA, 1.5, NA, 2.5, 3.5)
) %>%
as_epi_archive()
)
})

test_that("epi_slide_opt.epi_archive is not confused by unique(DT$time_value) unsorted", {
start_date <- as.Date("2020-01-01")
tibble(
geo_value = c(1, 1, 2, 2),
time_value = start_date - 1 + c(2, 3, 1, 2),
version = start_date - 1 + c(1, 2, 2, 2),
value = c(1, 2, 3, 4)
) %>%
as_epi_archive() %>%
epi_slide_opt(value, frollmean, .window_size = 2L) %>%
expect_equal(
tibble(
geo_value = c(1, 1, 2, 2),
time_value = start_date - 1 + c(2, 3, 1, 2),
version = start_date - 1 + c(1, 2, 2, 2),
value = c(1, 2, 3, 4),
value_2dav = c(NA, 1.5, NA, 3.5)
) %>%
as_epi_archive()
)
})

test_that("epi_slide_opt.epi_archive gives expected results on example data; also grouped behavior", {
# vs. built-in case_rate_7d_av column.
#
# If we compared the result's keyset against the original's, it would differ,
# as the original contains some tiny deviations in values that don't seem
# achievable with the available sliding functions. E.g., in the recomputed
# result, geo "ak" version "2020-11-01" changes time 2020-03-13 from 0 to
# 0.138 and time 2020-03-14 from a slightly different value of 0.138 to 0,
# while nearby times remain stable; in the original, this somehow produced a
# tiny update to the 7d_av for 2020-03-14 but not for the following times,
# while the recomputation also has minute updates to 2020-03-15 and
# 2020-03-16; 2020-03-17 onward have other case_rate changes factoring in.
# Compactifying and comparing with tolerances would account for some of these
# differences, but (as was only realized while writing this) both archives
# would need the recompactification with tolerance; it's not just
# epi_slide_opt.epi_archive's very rigid compactification that's the cause.
# (Side note: a configurable compactification tolerance in
# epi_slide_opt.epi_archive wasn't included because applying the tolerance to
# all columns rather than just the computed ones felt strange, compactifying
# just the new columns was slow with the approach tried, and the result
# wouldn't match what's achievable with the construction functions alone.)
#
# --> just compare essentially an epix_merge of the original & the recomputation:
case_death_rate_archive_time <- system.time(
case_death_rate_archive_result <- case_death_rate_archive %>%
epi_slide_opt(case_rate, frollmean, algo = "exact", .window_size = 7)
)
expect_equal(
case_death_rate_archive_result$DT$case_rate_7dav,
case_death_rate_archive_result$DT$case_rate_7d_av
)

# vs. computing via epix_slide:

mini_case_death_rate_archive <- case_death_rate_archive %>%
{
as_tibble(as.data.frame(.$DT))
} %>%
filter(geo_value %in% head(unique(geo_value), 4L)) %>%
as_epi_archive()

mini_case_death_rate_archive_time_opt <- system.time(
mini_case_death_rate_archive_result <- mini_case_death_rate_archive %>%
epi_slide_opt(case_rate, frollmean, .window_size = 7)
)

mini_case_death_rate_archive_time_gen <- system.time(
mini_case_death_rate_archive_expected <- mini_case_death_rate_archive %>%
epix_slide(
~ .x %>% epi_slide_opt(case_rate, frollmean, .window_size = 7)
) %>%
select(names(mini_case_death_rate_archive$DT), everything()) %>%
as_epi_archive()
)

expect_equal(mini_case_death_rate_archive_result, mini_case_death_rate_archive_expected)

mini_case_death_rate_archive_result2 <- mini_case_death_rate_archive %>%
group_by(geo_value) %>%
epi_slide_opt(case_rate, frollmean, .window_size = 7)

expect_equal(
mini_case_death_rate_archive_result2,
mini_case_death_rate_archive_result %>%
group_by(geo_value)
)

mini_case_death_rate_archive_b <- mini_case_death_rate_archive %>%
{
as_tibble(as.data.frame(.$DT))
} %>%
mutate(age_group = "overall") %>%
as_epi_archive(other_keys = "age_group")

expect_equal(
mini_case_death_rate_archive_b %>%
group_by(geo_value, age_group) %>%
epi_slide_opt(case_rate, frollmean, .window_size = 7),
mini_case_death_rate_archive_b %>%
epi_slide_opt(case_rate, frollmean, .window_size = 7) %>%
group_by(geo_value, age_group)
)

expect_error(
mini_case_death_rate_archive_b %>%
group_by(age_group) %>%
epi_slide_opt(case_rate, frollmean, .window_size = 7)
)

archive_cases_dv_subset_time_opt <- system.time(
archive_cases_dv_subset_result <- archive_cases_dv_subset %>%
epi_slide_opt(percent_cli, frollmean, .window_size = 7)
)

archive_cases_dv_subset_time_gen <- system.time(
archive_cases_dv_subset_expected <- archive_cases_dv_subset %>%
epix_slide(
~ .x %>% epi_slide_opt(percent_cli, frollmean, .window_size = 7)
) %>%
select(geo_value, time_value, version, everything()) %>%
as_epi_archive()
)

expect_equal(archive_cases_dv_subset_result, archive_cases_dv_subset_expected)
})