Skip to content

Commit 4366b59

Browse files
authored
Merge pull request #651 from cmu-delphi/lcb/archive-filter
Add `filter.epi_archive`
2 parents 231d979 + 332362b commit 4366b59

15 files changed

+461
-65
lines changed

DESCRIPTION

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
Type: Package
22
Package: epiprocess
33
Title: Tools for basic signal processing in epidemiology
4-
Version: 0.11.5
4+
Version: 0.11.6
55
Authors@R: c(
66
person("Jacob", "Bien", role = "ctb"),
77
person("Logan", "Brooks", , "[email protected]", role = c("aut", "cre")),

NAMESPACE

+1
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ S3method(epix_slide,epi_archive)
3232
S3method(epix_slide,grouped_epi_archive)
3333
S3method(epix_truncate_versions_after,epi_archive)
3434
S3method(epix_truncate_versions_after,grouped_epi_archive)
35+
S3method(filter,epi_archive)
3536
S3method(group_by,epi_archive)
3637
S3method(group_by,epi_df)
3738
S3method(group_by,grouped_epi_archive)

NEWS.md

+1
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat
88

99
- `is_epi_archive` function has been reintroduced.
1010
- `epix_as_of_current()` introduced as an alias for `epix_as_of(.$versions_end)`.
11+
- Added `dplyr::filter` implementation for `epi_archive`s.
1112

1213
# epiprocess 0.11
1314

R/archive.R

+17-24
Original file line numberDiff line numberDiff line change
@@ -52,13 +52,6 @@ validate_version_bound <- function(version_bound, x, na_ok = FALSE,
5252
class = "epiprocess__version_bound_mismatched_class"
5353
)
5454
}
55-
if (!identical(typeof(version_bound), typeof(x[["version"]]))) {
56-
cli_abort(
57-
"{version_bound_arg} must have the same `typeof` as x$version,
58-
which has a `typeof` of {typeof(x$version)}",
59-
class = "epiprocess__version_bound_mismatched_typeof"
60-
)
61-
}
6255
}
6356

6457
return(invisible(NULL))
@@ -207,23 +200,23 @@ next_after.Date <- function(x) x + 1L
207200
#' undergo tiny nonmeaningful revisions and the archive object with the
208201
#' default setting is too large.
209202
#' @param clobberable_versions_start Optional; `length`-1; either a value of the
210-
#' same `class` and `typeof` as `x$version`, or an `NA` of any `class` and
211-
#' `typeof`: specifically, either (a) the earliest version that could be
212-
#' subject to "clobbering" (being overwritten with different update data, but
213-
#' using the *same* version tag as the old update data), or (b) `NA`, to
214-
#' indicate that no versions are clobberable. There are a variety of reasons
215-
#' why versions could be clobberable under routine circumstances, such as (a)
216-
#' today's version of one/all of the columns being published after initially
217-
#' being filled with `NA` or LOCF, (b) a buggy version of today's data being
218-
#' published but then fixed and republished later in the day, or (c) data
219-
#' pipeline delays (e.g., publisher uploading, periodic scraping, database
220-
#' syncing, periodic fetching, etc.) that make events (a) or (b) reflected
221-
#' later in the day (or even on a different day) than expected; potential
222-
#' causes vary between different data pipelines. The default value is `NA`,
223-
#' which doesn't consider any versions to be clobberable. Another setting that
224-
#' may be appropriate for some pipelines is `max_version_with_row_in(x)`.
225-
#' @param versions_end Optional; length-1, same `class` and `typeof` as
226-
#' `x$version`: what is the last version we have observed? The default is
203+
#' same `class` as `x$version`, or an `NA` of any `class`: specifically,
204+
#' either (a) the earliest version that could be subject to "clobbering"
205+
#' (being overwritten with different update data, but using the *same* version
206+
#' tag as the old update data), or (b) `NA`, to indicate that no versions are
207+
#' clobberable. There are a variety of reasons why versions could be
208+
#' clobberable under routine circumstances, such as (a) today's version of
209+
#' one/all of the columns being published after initially being filled with
210+
#' `NA` or LOCF, (b) a buggy version of today's data being published but then
211+
#' fixed and republished later in the day, or (c) data pipeline delays (e.g.,
212+
#' publisher uploading, periodic scraping, database syncing, periodic
213+
#' fetching, etc.) that make events (a) or (b) reflected later in the day (or
214+
#' even on a different day) than expected; potential causes vary between
215+
#' different data pipelines. The default value is `NA`, which doesn't consider
216+
#' any versions to be clobberable. Another setting that may be appropriate for
217+
#' some pipelines is `max_version_with_row_in(x)`.
218+
#' @param versions_end Optional; length-1, same `class` as `x$version`: what is
219+
#' the last version we have observed? The default is
227220
#' `max_version_with_row_in(x)`, but values greater than this could also be
228221
#' valid, and would indicate that we observed additional versions of the data
229222
#' beyond `max(x$version)`, but they all contained empty updates. (The default

R/methods-epi_archive.R

+161-10
Original file line numberDiff line numberDiff line change
@@ -80,19 +80,13 @@ epix_as_of <- function(x, version, min_time_value = -Inf, all_versions = FALSE,
8080
"`version` must have the same `class` vector as `epi_archive$DT$version`."
8181
)
8282
}
83-
if (!identical(typeof(version), typeof(x$DT$version))) {
84-
cli_abort(
85-
"`version` must have the same `typeof` as `epi_archive$DT$version`."
86-
)
87-
}
8883
assert_scalar(version, na.ok = FALSE)
8984
if (version > x$versions_end) {
9085
cli_abort("`version` must be at most `epi_archive$versions_end`.")
9186
}
9287
assert_scalar(min_time_value, na.ok = FALSE)
9388
min_time_value_inf <- is.infinite(min_time_value) && min_time_value < 0
94-
min_time_value_same_type <- typeof(min_time_value) == typeof(x$DT$time_value) &
95-
class(min_time_value) == class(x$DT$time_value)
89+
min_time_value_same_type <- identical(class(min_time_value), class(x$DT$time_value))
9690
if (!min_time_value_inf && !min_time_value_same_type) {
9791
cli_abort("`min_time_value` must be either -Inf or a time_value of the same type and
9892
class as `epi_archive$time_value`.")
@@ -941,9 +935,6 @@ epix_truncate_versions_after.epi_archive <- function(x, max_version) {
941935
if (!identical(class(max_version), class(x$DT$version))) {
942936
cli_abort("`max_version` must have the same `class` as `epi_archive$DT$version`.")
943937
}
944-
if (!identical(typeof(max_version), typeof(x$DT$version))) {
945-
cli_abort("`max_version` must have the same `typeof` as `epi_archive$DT$version`.")
946-
}
947938
assert_scalar(max_version, na.ok = FALSE)
948939
if (max_version > x$versions_end) {
949940
cli_abort("`max_version` must be at most `epi_archive$versions_end`.")
@@ -1020,3 +1011,163 @@ dplyr_col_modify.col_modify_recorder_df <- function(data, cols) {
10201011
attr(data, "epiprocess::col_modify_recorder_df::cols") <- cols
10211012
data
10221013
}
1014+
1015+
1016+
1017+
#' [`dplyr::filter`] for `epi_archive`s
1018+
#'
1019+
#' @param .data an `epi_archive`
1020+
#' @param ... as in [`dplyr::filter`]; using the `version` column or measurement
1021+
#' columns is not allowed unless you use `.format_aware = TRUE`; see details.
1022+
#' @param .by as in [`dplyr::filter`]
1023+
#' @param .format_aware optional, `TRUE` or `FALSE`; default `FALSE`. See
1024+
#' details.
1025+
#'
1026+
#' @details
1027+
#'
1028+
#' By default, using the `version` column or measurement columns is disabled as
1029+
#' it's easy to get unexpected results. See if either [`epix_as_of`] or
1030+
#' [`epix_slide`] works for any version selection you have in mind: for version
1031+
#' selection, see the `version` or `.versions` args, respectively; for
1032+
#' measurement column-based filtering, try `filter`ing after `epix_as_of` or
1033+
#' inside the `.f` in `epix_slide()`. If they don't cover your use case, then
1034+
#' you can set `.format_aware = TRUE` to enable usage of these columns, but be
1035+
#' careful to:
1036+
#' * Factor in that `.data$DT` may have been converted into a compact format
1037+
#' based on diffing consecutive versions, and the last version of each
1038+
#' observation in `.data$DT` will always be carried forward to future
1039+
#' `version`s; see details of [`as_epi_archive`].
1040+
#' * Set `clobberable_versions_start` and `versions_end` of the result
1041+
#' appropriately after the `filter` call. They will be initialized with the
1042+
#' same values as in `.data`.
1043+
#'
1044+
#' `dplyr::filter` also has an optional argument `.preserve`, which should not
1045+
#' have an impact on (ungrouped) `epi_archive`s, and `grouped_epi_archive`s do
1046+
#' not currently support `dplyr::filter`.
1047+
#'
1048+
#' @examples
1049+
#'
1050+
#' # Filter to one location and a particular time range:
1051+
#' archive_cases_dv_subset %>%
1052+
#' filter(geo_value == "fl", time_value >= as.Date("2020-10-01"))
1053+
#'
1054+
#' # Convert to weekly by taking the Saturday data for each week, so that
1055+
#' # `case_rate_7d_av` represents a Sun--Sat average:
1056+
#' archive_cases_dv_subset %>%
1057+
#' filter(as.POSIXlt(time_value)$wday == 6L)
1058+
#'
1059+
#' # Filtering involving the `version` column or measurement columns requires
1060+
#' # extra care. See epix_as_of and epix_slide instead for some common
1061+
#' # operations. One semi-common operation that ends up being fairly simple is
1062+
#' # treating observations as finalized after some amount of time, and ignoring
1063+
#' # any revisions that were made after that point:
1064+
#' archive_cases_dv_subset %>%
1065+
#' filter(
1066+
#' version <= time_value + as.difftime(60, units = "days"),
1067+
#' .format_aware = TRUE
1068+
#' )
1069+
#'
1070+
#' @export
1071+
filter.epi_archive <- function(.data, ..., .by = NULL, .format_aware = FALSE) {
1072+
in_tbl <- tibble::as_tibble(as.list(.data$DT), .name_repair = "minimal")
1073+
if (.format_aware) {
1074+
out_tbl <- in_tbl %>%
1075+
filter(..., .by = {{ .by }})
1076+
} else {
1077+
measurement_colnames <- setdiff(names(.data$DT), key_colnames(.data))
1078+
forbidden_colnames <- c("version", measurement_colnames)
1079+
out_tbl <- in_tbl %>%
1080+
filter(
1081+
# Add our own fake filter arg to the user's ..., to update the data mask
1082+
# to prevent `version` and measurement column usage.
1083+
{
1084+
# We should be evaluating inside the data mask. To disable both
1085+
# `version` and `.data$version` etc., we need to go to the ancestor
1086+
# environment containing the data mask's column bindings. This is
1087+
# likely just the parent env, but search to make sure, in a way akin
1088+
# to `<<-`:
1089+
e <- environment()
1090+
while (!identical(e, globalenv()) && !identical(e, emptyenv())) { # nolint:vector_logic_linter
1091+
if ("version" %in% names(e)) {
1092+
# This is where the column bindings are. Replace the forbidden ones.
1093+
# They are expected to be active bindings, so directly
1094+
# assigning has issues; `rm` first.
1095+
rm(list = forbidden_colnames, envir = e)
1096+
eval_env <- new.env(parent = asNamespace("epiprocess")) # see (2) below
1097+
delayedAssign(
1098+
"version",
1099+
cli_abort(c(
1100+
"Using `version` in `filter.epi_archive` may produce unexpected results.",
1101+
">" = "See if `epix_as_of` or `epix_slide` would work instead.",
1102+
">" = "If not, see `?filter.epi_archive` details for how to proceed."
1103+
), class = "epiprocess__filter_archive__used_version"),
1104+
eval.env = eval_env,
1105+
assign.env = e
1106+
)
1107+
for (measurement_colname in measurement_colnames) {
1108+
# Record current `measurement_colname` and set up execution for
1109+
# the promise for the error in its own dedicated environment, so
1110+
# that (1) `for` loop updating its value and `rm` cleanup don't
1111+
# mess things up. We can also (2) prevent changes to data mask
1112+
# ancestry (to involve user's quosure env rather than our
1113+
# quosure env) or contents (from edge case of user binding
1114+
# functions inside the mask) from potentially interfering by
1115+
# setting the promise's execution environment to skip over the
1116+
# data mask.
1117+
eval_env <- new.env(parent = asNamespace("epiprocess"))
1118+
eval_env[["local_measurement_colname"]] <- measurement_colname
1119+
delayedAssign(
1120+
measurement_colname,
1121+
cli_abort(c(
1122+
"Using `{format_varname(local_measurement_colname)}`
1123+
in `filter.epi_archive` may produce unexpected results.",
1124+
">" = "See `?filter.epi_archive` details for how to proceed."
1125+
), class = "epiprocess__filter_archive__used_measurement"),
1126+
eval.env = eval_env,
1127+
assign.env = e
1128+
)
1129+
}
1130+
break
1131+
}
1132+
e <- parent.env(e)
1133+
}
1134+
# Don't mask similarly-named user objects in ancestor envs:
1135+
rm(list = c("e", "measurement_colname", "eval_env"))
1136+
TRUE
1137+
},
1138+
...,
1139+
.by = {{ .by }}
1140+
)
1141+
}
1142+
# We could try to re-infer the geo_type, e.g., when filtering from
1143+
# national+state to just state. However, we risk inference failures such as
1144+
# "hrr" -> "hhs" from filtering to hrr 10, or "custom" -> USA-related when
1145+
# working with non-USA data:
1146+
out_geo_type <- .data$geo_type
1147+
if (.data$time_type == "day") {
1148+
# We might be going from daily to weekly; re-infer:
1149+
out_time_type <- guess_time_type(out_tbl$time_value)
1150+
} else {
1151+
# We might be filtering weekly to a single time_value; avoid re-inferring to
1152+
# stay "week". Or in other cases, just skip inferring, as re-inferring is
1153+
# expected to match the input time_type:
1154+
out_time_type <- .data$time_type
1155+
}
1156+
# Even if they narrow down to just a single value of an other_keys column,
1157+
# it's probably still better (& simpler) to treat it as an other_keys column
1158+
# since it still exists in the result:
1159+
out_other_keys <- .data$other_keys
1160+
# `filter` makes no guarantees about not aliasing columns in its result when
1161+
# the filter condition is all TRUE, so don't setDT.
1162+
out_dtbl <- as.data.table(out_tbl, key = out_other_keys)
1163+
result <- new_epi_archive(
1164+
out_dtbl,
1165+
out_geo_type, out_time_type, out_other_keys,
1166+
# Assume version-related metadata unchanged; part of why we want to push
1167+
# back on filter expressions like `.data$version <= .env$as_of`:
1168+
.data$clobberable_versions_start, .data$versions_end
1169+
)
1170+
# Filtering down rows while keeping all (ukey) columns should preserve ukey
1171+
# uniqueness.
1172+
result
1173+
}

R/methods-epi_df.R

+1-1
Original file line numberDiff line numberDiff line change
@@ -532,7 +532,7 @@ sum_groups_epi_df <- function(.x, sum_cols, group_cols = "time_value") {
532532
if (!"geo_value" %in% group_cols) {
533533
out <- out %>%
534534
mutate(geo_value = "total") %>%
535-
relocate(.data$geo_value, .before = 1)
535+
relocate("geo_value", .before = 1)
536536
}
537537

538538
# The `geo_type` will be correctly inherited here by the following logic:

_pkgdown.yml

+2-1
Original file line numberDiff line numberDiff line change
@@ -72,8 +72,9 @@ reference:
7272
- epix_as_of
7373
- epix_as_of_current
7474
- epix_slide
75-
- epix_merge
7675
- revision_summary
76+
- epix_merge
77+
- filter.epi_archive
7778
- epix_fill_through_version
7879
- epix_truncate_versions_after
7980
- set_versions_end

man/epi_archive.Rd

+17-17
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)