cmu-delphi · brookslogan · Feb 26, 2025 · Feb 26, 2025 · Feb 28, 2025 · Mar 3, 2025
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -97,6 +97,8 @@ Collate:
     'correlation.R'
     'epi_df.R'
     'epi_df_forbidden_methods.R'
+    'epi_slide_opt_archive.R'
+    'epi_slide_opt_edf.R'
     'epiprocess-package.R'
     'group_by_epi_df_methods.R'
     'methods-epi_archive.R'
@@ -105,6 +107,7 @@ Collate:
     'key_colnames.R'
     'methods-epi_df.R'
     'outliers.R'
+    'patch.R'
     'reexports.R'
     'revision_analysis.R'
     'slide.R'

diff --git a/NAMESPACE b/NAMESPACE
@@ -27,6 +27,9 @@ S3method(dplyr_col_modify,col_modify_recorder_df)
 S3method(dplyr_col_modify,epi_df)
 S3method(dplyr_reconstruct,epi_df)
 S3method(dplyr_row_slice,epi_df)
+S3method(epi_slide_opt,epi_archive)
+S3method(epi_slide_opt,epi_df)
+S3method(epi_slide_opt,grouped_epi_archive)
 S3method(epix_slide,epi_archive)
 S3method(epix_slide,grouped_epi_archive)
 S3method(epix_truncate_versions_after,epi_archive)
@@ -101,6 +104,7 @@ export(time_column_names)
 export(ungroup)
 export(unnest)
 export(validate_epi_archive)
+export(vec_approx_equal)
 export(version_column_names)
 import(epidatasets)
 importFrom(checkmate,anyInfinite)
@@ -117,13 +121,19 @@ importFrom(checkmate,assert_logical)
 importFrom(checkmate,assert_number)
 importFrom(checkmate,assert_numeric)
 importFrom(checkmate,assert_scalar)
+importFrom(checkmate,assert_set_equal)
 importFrom(checkmate,assert_string)
 importFrom(checkmate,assert_subset)
 importFrom(checkmate,assert_tibble)
+importFrom(checkmate,assert_true)
 importFrom(checkmate,checkInt)
 importFrom(checkmate,check_atomic)
+importFrom(checkmate,check_character)
 importFrom(checkmate,check_data_frame)
+importFrom(checkmate,check_logical)
 importFrom(checkmate,check_names)
+importFrom(checkmate,check_null)
+importFrom(checkmate,check_numeric)
 importFrom(checkmate,expect_class)
 importFrom(checkmate,test_int)
 importFrom(checkmate,test_set_equal)
@@ -143,6 +153,7 @@ importFrom(data.table,address)
 importFrom(data.table,as.data.table)
 importFrom(data.table,between)
 importFrom(data.table,copy)
+importFrom(data.table,fifelse)
 importFrom(data.table,frollapply)
 importFrom(data.table,frollmean)
 importFrom(data.table,frollsum)
@@ -151,6 +162,8 @@ importFrom(data.table,key)
 importFrom(data.table,rbindlist)
 importFrom(data.table,set)
 importFrom(data.table,setDF)
+importFrom(data.table,setDT)
+importFrom(data.table,setcolorder)
 importFrom(data.table,setkeyv)
 importFrom(dplyr,"%>%")
 importFrom(dplyr,across)
@@ -173,8 +186,8 @@ importFrom(dplyr,if_all)
 importFrom(dplyr,if_any)
 importFrom(dplyr,if_else)
 importFrom(dplyr,is_grouped_df)
-importFrom(dplyr,lag)
 importFrom(dplyr,mutate)
+importFrom(dplyr,n_groups)
 importFrom(dplyr,pick)
 importFrom(dplyr,pull)
 importFrom(dplyr,relocate)
@@ -200,6 +213,7 @@ importFrom(rlang,"%||%")
 importFrom(rlang,.data)
 importFrom(rlang,.env)
 importFrom(rlang,arg_match)
+importFrom(rlang,arg_match0)
 importFrom(rlang,caller_arg)
 importFrom(rlang,caller_env)
 importFrom(rlang,check_dots_empty)
@@ -212,6 +226,7 @@ importFrom(rlang,expr_label)
 importFrom(rlang,f_env)
 importFrom(rlang,f_rhs)
 importFrom(rlang,is_bare_integerish)
+importFrom(rlang,is_bare_list)
 importFrom(rlang,is_bare_numeric)
 importFrom(rlang,is_environment)
 importFrom(rlang,is_formula)
@@ -235,10 +250,12 @@ importFrom(slider,slide_sum)
 importFrom(stats,cor)
 importFrom(stats,median)
 importFrom(tibble,as_tibble)
+importFrom(tibble,is_tibble)
 importFrom(tibble,new_tibble)
 importFrom(tibble,validate_tibble)
 importFrom(tidyr,complete)
 importFrom(tidyr,full_seq)
+importFrom(tidyr,nest)
 importFrom(tidyr,unnest)
 importFrom(tidyselect,any_of)
 importFrom(tidyselect,eval_select)
@@ -248,15 +265,24 @@ importFrom(tsibble,as_tsibble)
 importFrom(utils,capture.output)
 importFrom(utils,tail)
 importFrom(vctrs,"vec_slice<-")
+importFrom(vctrs,obj_is_vector)
 importFrom(vctrs,vec_cast)
+importFrom(vctrs,vec_cast_common)
 importFrom(vctrs,vec_data)
 importFrom(vctrs,vec_duplicate_any)
+importFrom(vctrs,vec_duplicate_id)
 importFrom(vctrs,vec_equal)
 importFrom(vctrs,vec_in)
+importFrom(vctrs,vec_match)
 importFrom(vctrs,vec_order)
+importFrom(vctrs,vec_ptype)
 importFrom(vctrs,vec_rbind)
+importFrom(vctrs,vec_recycle)
 importFrom(vctrs,vec_recycle_common)
 importFrom(vctrs,vec_rep)
+importFrom(vctrs,vec_rep_each)
+importFrom(vctrs,vec_seq_along)
 importFrom(vctrs,vec_size)
+importFrom(vctrs,vec_size_common)
 importFrom(vctrs,vec_slice)
 importFrom(vctrs,vec_sort)
diff --git a/NEWS.md b/NEWS.md
@@ -2,6 +2,20 @@
 
 Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicate PR's.
 
+# epiprocess 0.12
+
+## Breaking changes
+
+- `new_epi_archive()`'s `x` argument has been replaced with a `data_table`
+  argument, which must be a `data.table` with the key already set appropriately.
+  The `key()` of an archive's `DT` now also places `other_keys` before rather
+  than after `"time_value"`.
+
+## Bug fixes
+
+- `as_epi_archive()` no longer has issues setting its `DT`'s `key` on some
+  versions of `{data.table}` when `x` is a tibble.
+
 # epiprocess 0.11
 
 ## Breaking changes

diff --git a/R/archive.R b/R/archive.R
@@ -186,8 +186,8 @@ next_after.Date <- function(x) x + 1L
 #'  archive. Unexpected behavior may result from modifying the metadata
 #'  directly.
 #'
-#' @param x A data.frame, data.table, or tibble, with columns `geo_value`,
-#'   `time_value`, `version`, and then any additional number of columns.
+#' @param data_table a data.table with [`data.table::key()`] equal to
+#'   `c("geo_value", other_keys, "time_value", "version")`.
 #' @param geo_type DEPRECATED Has no effect. Geo value type is inferred from the
 #' location column and set to "custom" if not recognized.
 #' @param time_type DEPRECATED Has no effect. Time value type inferred from the time
@@ -278,41 +278,22 @@ next_after.Date <- function(x) x + 1L
 #' x <- df %>% as_epi_archive(other_keys = "county")
 #'
 new_epi_archive <- function(
-    x,
+    data_table,
     geo_type,
     time_type,
     other_keys,
     clobberable_versions_start,
     versions_end) {
-  assert_data_frame(x)
+  assert_class(data_table, "data.table")
   assert_string(geo_type)
   assert_string(time_type)
   assert_character(other_keys, any.missing = FALSE)
   if (any(c("geo_value", "time_value", "version") %in% other_keys)) {
     cli_abort("`other_keys` cannot contain \"geo_value\", \"time_value\", or \"version\".")
   }
-  validate_version_bound(clobberable_versions_start, x, na_ok = TRUE)
-  validate_version_bound(versions_end, x, na_ok = FALSE)
-
-  key_vars <- c("geo_value", "time_value", other_keys, "version")
-  if (!all(key_vars %in% names(x))) {
-    # Give a more tailored error message than as.data.table would:
-    cli_abort(c(
-      "`x` is missing the following expected columns:
-       {format_varnames(setdiff(key_vars, names(x)))}.",
-      ">" = "You might need to `dplyr::rename()` beforehand
-                       or use `as_epi_archive()`'s renaming feature.",
-      ">" = if (!all(other_keys %in% names(x))) {
-        "Check also for typos in `other_keys`."
-      }
-    ))
-  }
-
-  # Create the data table; if x was an un-keyed data.table itself,
-  # then the call to as.data.table() will fail to set keys, so we
-  # need to check this, then do it manually if needed
-  data_table <- as.data.table(x, key = key_vars)
-  if (!identical(key_vars, key(data_table))) setkeyv(data_table, cols = key_vars)
+  assert_true(identical(key(data_table), c("geo_value", other_keys, "time_value", "version")))
+  validate_version_bound(clobberable_versions_start, data_table, na_ok = TRUE)
+  validate_version_bound(versions_end, data_table, na_ok = FALSE)
 
   structure(
     list(
@@ -334,7 +315,7 @@ new_epi_archive <- function(
 validate_epi_archive <- function(x) {
   assert_class(x, "epi_archive")
 
-  ukey_vars1 <- c("geo_value", "time_value", x$other_keys, "version")
+  ukey_vars1 <- c("geo_value", x$other_keys, "time_value", "version")
   ukey_vars2 <- key(x$DT)
   if (!identical(ukey_vars1, ukey_vars2)) {
     cli_abort(c("`data.table::key(x$DT)` not as expected",
@@ -401,7 +382,7 @@ validate_epi_archive <- function(x) {
 #'   would be `key(DT)`.
 #' @param abs_tol numeric, >=0; absolute tolerance to use on numeric measurement
 #'   columns when determining whether something can be compactified away; see
-#'   [`is_locf`]
+#'   [`vec_approx_equal`]
 #'
 #' @importFrom data.table is.data.table key
 #' @importFrom dplyr arrange filter
@@ -420,10 +401,23 @@ apply_compactify <- function(updates_df, ukey_names, abs_tol = 0) {
   }
   assert_numeric(abs_tol, len = 1, lower = 0)
 
-  if (!is.data.table(updates_df) || !identical(key(updates_df), ukey_names)) {
+  if (is.data.table(updates_df)) {
+    if (!identical(key(updates_df), ukey_names)) {
+      cli_abort(c("`ukey_names` should match `key(updates_df)`",
+        "i" = "`ukey_names` was {format_chr_deparse(ukey_names)}",
+        "i" = "`key(updates_df)` was {format_chr_deparse(key(updates_df))}"
+      ))
+    }
+  } else {
     updates_df <- updates_df %>% arrange(pick(all_of(ukey_names)))
   }
-  updates_df[!update_is_locf(updates_df, ukey_names, abs_tol), ]
+
+  # If updates_df is a data.table, store the keep flags in a local variable:
+  # "When the first argument inside DT[...] is a single symbol (e.g. DT[var]),
+  # data.table looks for var in calling scope". If it is not a data.table, use
+  # df[i, ] rather than df[i], which would index columns instead of rows.
+  to_keep <- !update_is_locf(updates_df, ukey_names, abs_tol)
+  updates_df[to_keep, ]
 }
 
 #' get the entries that `compactify` would remove
@@ -460,56 +454,38 @@ update_is_locf <- function(arranged_updates_df, ukey_names, abs_tol) {
   ekt_names <- ukey_names[ukey_names != "version"]
   val_names <- all_names[!all_names %in% ukey_names]
 
-  Reduce(`&`, lapply(updates_col_refs[ekt_names], is_locf, abs_tol, TRUE)) &
-    Reduce(`&`, lapply(updates_col_refs[val_names], is_locf, abs_tol, FALSE))
-}
-
-#' Checks to see if a value in a vector is LOCF
-#' @description LOCF meaning last observation carried forward (to later
-#'   versions). Lags the vector by 1, then compares with itself. If `is_key` is
-#'   `TRUE`, only values that are exactly the same between the lagged and
-#'   original are considered LOCF. If `is_key` is `FALSE` and `vec` is a vector
-#'   of numbers ([`base::is.numeric`]), then approximate equality will be used,
-#'   checking whether the absolute difference between each pair of entries is
-#'   `<= abs_tol`; if `vec` is something else, then exact equality is used
-#'   instead.
-#'
-#' @details
-#'
-#' We include epikey-time columns in LOCF comparisons as part of an optimization
-#' to avoid slower grouped operations while still ensuring that the first
-#' observation for each time series will not be marked as LOCF. We test these
-#' key columns for exact equality to prevent chopping off consecutive
-#' time_values during flat periods when `abs_tol` is high.
-#'
-#' We use exact equality for non-`is.numeric` double/integer columns such as
-#' dates, datetimes, difftimes, `tsibble::yearmonth`s, etc., as these may be
-#' used as part of re-indexing or grouping procedures, and we don't want to
-#' change the number of groups for those operations when we remove LOCF data
-#' during compactification.
-#'
-#' @importFrom dplyr lag if_else
-#' @importFrom rlang is_bare_numeric
-#' @importFrom vctrs vec_equal
-#' @keywords internal
-is_locf <- function(vec, abs_tol, is_key) { # nolint: object_usage_linter
-  lag_vec <- lag(vec)
-  if (is.vector(vec, mode = "numeric") && !is_key) {
-    # (integer or double vector, no class (& no dims); maybe names, which we'll
-    # ignore like `vec_equal`); not a key column
-    unname(if_else(
-      !is.na(vec) & !is.na(lag_vec),
-      abs(vec - lag_vec) <= abs_tol,
-      is.na(vec) & is.na(lag_vec)
-    ))
+  n_updates <- nrow(arranged_updates_df)
+  if (n_updates == 0L) {
+    logical(0L)
+  } else if (n_updates == 1L) {
+    FALSE # sole observation is not LOCF
   } else {
-    vec_equal(vec, lag_vec, na_equal = TRUE)
+    ekts_tbl <- new_tibble(updates_col_refs[ekt_names])
+    vals_tbl <- new_tibble(updates_col_refs[val_names])
+    # n_updates >= 2L so we can use `:` naturally (this is the reason for
+    # separating out n_updates == 1L from this case):
+    inds1 <- 2L:n_updates
+    inds2 <- 1L:(n_updates - 1L)
+    c(
+      FALSE, # first observation is not LOCF
+      vec_approx_equal0(ekts_tbl,
+        inds1 = inds1, ekts_tbl, inds2 = inds2,
+        # check ekt (key) cols with 0 tolerance:
+        na_equal = TRUE, abs_tol = 0
+      ) &
+        vec_approx_equal0(vals_tbl,
+          inds1 = inds1, vals_tbl, inds2 = inds2,
+          na_equal = TRUE, abs_tol = abs_tol
+        )
+    )
   }
 }
 
 #' `as_epi_archive` converts a data frame, data table, or tibble into an
 #' `epi_archive` object.
 #'
+#' @param x A data.frame, data.table, or tibble, with columns `geo_value`,
+#'   `time_value`, `version`, and then any number of additional columns.
 #' @param ... used for specifying column names, as in [`dplyr::rename`]. For
 #'   example `version = release_date`
 #' @param .versions_end location based versions_end, used to avoid prefix
@@ -530,11 +506,32 @@ as_epi_archive <- function(
     .versions_end = max_version_with_row_in(x), ...,
     versions_end = .versions_end) {
   assert_data_frame(x)
+  # Convert first to data.frame to guard against data.table#6859 and potentially
+  # other issues (see epiprocess#618):
+  x_already_copied <- identical(class(x), c("data.table", "data.frame"))
+  x <- as.data.frame(x)
   x <- rename(x, ...)
-  x <- guess_column_name(x, "time_value", time_column_names())
   x <- guess_column_name(x, "geo_value", geo_column_names())
+  if (!all(other_keys %in% names(x))) {
+    # Give a more tailored error message than as.data.table would:
+    cli_abort(c(
+      "`x` is missing the following expected columns:
+       {format_varnames(setdiff(other_keys, names(x)))}.",
+      ">" = "You might need to `dplyr::rename()` beforehand
+             or using `as_epi_archive()`'s renaming feature."
+    ))
+  }
+  x <- guess_column_name(x, "time_value", time_column_names())
   x <- guess_column_name(x, "version", version_column_names())
 
+  # Convert to data.table:
+  key_vars <- c("geo_value", other_keys, "time_value", "version")
+  if (x_already_copied) {
+    setDT(x, key = key_vars)
+  } else {
+    x <- as.data.table(x, key = key_vars)
+  }
+
   if (lifecycle::is_present(geo_type)) {
     cli_warn("epi_archive constructor argument `geo_type` is now ignored. Consider removing.")
   }
@@ -555,11 +552,10 @@ as_epi_archive <- function(
     cli_abort('`compactify` must be `TRUE`, `FALSE`, or `"message"`')
   }
 
-  data_table <- result$DT
-  key_vars <- key(data_table)
+  data_table <- result$DT # probably just `x`, but take no chances
 
   nrow_before_compactify <- nrow(data_table)
-  # Runs compactify on data frame
+  # Runs compactify on data_table
   if (identical(compactify, TRUE) || identical(compactify, "message")) {
     compactified <- apply_compactify(data_table, key_vars, compactify_abs_tol)
   } else {