Skip to content

Commit d1ba68e

Browse files
authored
Merge pull request #652 from cmu-delphi/archive-agg-nits
comments and some nit rewrites
2 parents cdd9fee + ae70f78 commit d1ba68e

8 files changed

+54
-31
lines changed

R/archive.R

+10-2
Original file line numberDiff line numberDiff line change
@@ -438,7 +438,8 @@ removed_by_compactify <- function(updates_df, ukey_names, abs_tol) {
438438
updates_df[update_is_locf(updates_df, ukey_names, abs_tol), ]
439439
}
440440

441-
#' Internal helper; lgl; which updates are LOCF
441+
#' Internal helper; lgl; which updates are LOCF and should thus be dropped when
442+
#' compactifying
442443
#'
443444
#' (Not validated:) Must be called inside certain dplyr data masking verbs (e.g.,
444445
#' `filter` or `mutate`) being run on an `epi_archive`'s `DT` or a data frame
@@ -470,12 +471,18 @@ update_is_locf <- function(arranged_updates_df, ukey_names, abs_tol) {
470471
} else {
471472
ekts_tbl <- new_tibble(updates_col_refs[ekt_names])
472473
vals_tbl <- new_tibble(updates_col_refs[val_names])
474+
# grab the data and a shifted version of the data, and compute the
475+
# entry-wise difference to see if the value has changed
473476
# n_updates >= 2L so we can use `:` naturally (this is the reason for
474477
# separating out n_updates == 1L from this case):
475478
inds1 <- 2L:n_updates
476479
inds2 <- 1L:(n_updates - 1L)
477480
c(
478481
FALSE, # first observation is not LOCF
482+
# for the rest, check that both the keys are exactly the same, and the
483+
# values are within abs_tol
484+
# the key comparison effectively implements a group_by, so that when the
485+
# key changes we're guaranteed the value is correct
479486
vec_approx_equal0(ekts_tbl,
480487
inds1 = inds1, ekts_tbl, inds2 = inds2,
481488
# check ekt (key) cols with 0 tolerance:
@@ -493,7 +500,8 @@ update_is_locf <- function(arranged_updates_df, ukey_names, abs_tol) {
493500
#' `epi_archive` object.
494501
#'
495502
#' @param x A data.frame, data.table, or tibble, with columns `geo_value`,
496-
#' `time_value`, `version`, and then any additional number of columns.
503+
#' `time_value`, `version`, and then any additional number of columns, either
504+
#' keys or values.
497505
#' @param ... used for specifying column names, as in [`dplyr::rename`]. For
498506
#' example `version = release_date`
499507
#' @param .versions_end location based versions_end, used to avoid prefix

R/epi_slide_opt_archive.R

+7-7
Original file line numberDiff line numberDiff line change
@@ -27,12 +27,12 @@
2727
#'
2828
#' library(dplyr)
2929
#' grp_updates <- bind_rows(
30-
#' tibble(version = 10, time_value = 1:20, value = 1:20),
31-
#' tibble(version = 12, time_value = 4:5, value = 5:4),
32-
#' tibble(version = 13, time_value = 8, value = 9),
33-
#' tibble(version = 14, time_value = 11, value = NA),
34-
#' tibble(version = 15, time_value = -10, value = -10),
35-
#' tibble(version = 16, time_value = 50, value = 50)
30+
#' tibble(version = 30, time_value = 1:20, value = 1:20),
31+
#' tibble(version = 32, time_value = 4:5, value = 5:4),
32+
#' tibble(version = 33, time_value = 8, value = 9),
33+
#' tibble(version = 34, time_value = 11, value = NA),
34+
#' tibble(version = 35, time_value = -10, value = -10),
35+
#' tibble(version = 56, time_value = 50, value = 50)
3636
#' ) %>%
3737
#' mutate(across(c(version, time_value), ~ as.Date("2020-01-01") - 1 + .x))
3838
#'
@@ -108,7 +108,7 @@ epi_slide_opt_archive_one_epikey <- function(
108108
slide[[out_colnames[[col_i]]]] <- f_dots_baked(slide[[in_colnames[[col_i]]]], before = before, after = after)
109109
}
110110
} else {
111-
cli_abort("epiprocess internal error: `f_from_package` was {format_chr_deparse(f_from_package)}",
111+
cli_abort("epiprocess internal error: `f_from_package` was {format_chr_deparse(f_from_package)}, which is unsupported",
112112
class = "epiprocess__epi_slide_opt_archive__f_from_package_invalid"
113113
)
114114
}

R/epi_slide_opt_edf.R

+2-1
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,8 @@ upstream_slide_f_info <- function(.f, ...) {
7373
)
7474
}
7575

76-
#' Calculate input and output column names for an `{epiprocess}` [`dplyr::across`]-like operations
76+
#' Calculate input and output column names for an `{epiprocess}`
77+
#' [`dplyr::across`]-like operations
7778
#'
7879
#' @param .x data.frame to perform input column tidyselection on
7980
#' @param time_type as in [`new_epi_df`]

R/patch.R

+22-12
Original file line numberDiff line numberDiff line change
@@ -121,17 +121,22 @@ vec_approx_equal0 <- function(vec1, vec2, na_equal, abs_tol, inds1 = NULL, inds2
121121
na_or_nan1 <- is.na(vec1)
122122
na_or_nan2 <- is.na(vec2)
123123
# Since above are bare logical vectors, we can use `fifelse`
124-
res <- fifelse(
125-
!na_or_nan1 & !na_or_nan2,
126-
abs(vec1 - vec2) <= abs_tol,
127-
if (na_equal) {
124+
if (na_equal) {
125+
res <- fifelse(
126+
!na_or_nan1 & !na_or_nan2,
127+
abs(vec1 - vec2) <= abs_tol,
128128
na_or_nan1 & na_or_nan2 & (is.nan(vec1) == is.nan(vec2))
129-
} else {
130-
# Like `==` and `vec_equal`, we consider NaN == {NA, NaN, anything else}
131-
# to be NA.
129+
)
130+
} else {
131+
# Like `==` and `vec_equal`, we consider NaN == {NA, NaN, anything else}
132+
# to be NA.
133+
res <- fifelse(
134+
!na_or_nan1 & !na_or_nan2,
135+
abs(vec1 - vec2) <= abs_tol,
132136
NA
133-
}
134-
)
137+
)
138+
}
139+
135140
if (!is.null(dim(vec1))) {
136141
dim(res) <- dim(vec1)
137142
res <- rowSums(res) == ncol(res)
@@ -278,9 +283,9 @@ tbl_diff2 <- function(earlier_snapshot, later_tbl,
278283
# ukey+val duplicates (cases 2. and 3.).)
279284

280285
# Row indices of first occurrence of each ukey; will be the same as
281-
# seq_len(combined_n) except for when that ukey has been re-reported in
282-
# `later_tbl`, in which case (3. or 4.) it will point back to the row index of
283-
# the same ukey in `earlier_snapshot`:
286+
# seq_len(combined_n) for each ukey's first appearance (cases 1., 2., or 5.);
287+
# for re-reported ukeys in `later_tbl` (cases 3. or 4.), it will point back to
288+
# the row index of the same ukey in `earlier_snapshot`:
284289
combined_ukey_firsts <- vec_duplicate_id(combined_ukeys)
285290

286291
# Which rows from combined are cases 3. or 4.?
@@ -368,6 +373,11 @@ tbl_patch <- function(snapshot, update, ukey_names) {
368373
result_tbl <- vec_rbind(update, snapshot)
369374

370375
dup_ids <- vec_duplicate_id(result_tbl[ukey_names])
376+
# Find the "first" appearance of each ukey; since `update` is ordered before `snapshot`,
377+
# this means favoring the rows from `update` over those in `snapshot`.
378+
# This is like `!duplicated()` but faster, and like `vec_unique_loc()` but guaranteeing
379+
# that we get the first appearance since `vec_duplicate_id()` guarantees that
380+
# it points to the first appearance.
371381
not_overwritten <- dup_ids == vec_seq_along(result_tbl)
372382
result_tbl <- result_tbl[not_overwritten, ]
373383

man/across_ish_names_info.Rd

+4-2
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/epi_slide_opt_archive_one_epikey.Rd

+6-6
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/update_is_locf.Rd

+2-1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

tests/testthat/test-epi_slide_opt_archive.R

+1
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,7 @@ test_that("epi_slide_opt.epi_archive gives expected results on example data; als
180180
mutate(age_group = "overall") %>%
181181
as_epi_archive(other_keys = "age_group")
182182

183+
# grouping shouldn't change the outcome
183184
expect_equal(
184185
mini_case_death_rate_archive_b %>%
185186
group_by(geo_value, age_group) %>%

0 commit comments

Comments
 (0)