@@ -80,19 +80,13 @@ epix_as_of <- function(x, version, min_time_value = -Inf, all_versions = FALSE,
80
80
" `version` must have the same `class` vector as `epi_archive$DT$version`."
81
81
)
82
82
}
83
- if (! identical(typeof(version ), typeof(x $ DT $ version ))) {
84
- cli_abort(
85
- " `version` must have the same `typeof` as `epi_archive$DT$version`."
86
- )
87
- }
88
83
assert_scalar(version , na.ok = FALSE )
89
84
if (version > x $ versions_end ) {
90
85
cli_abort(" `version` must be at most `epi_archive$versions_end`." )
91
86
}
92
87
assert_scalar(min_time_value , na.ok = FALSE )
93
88
min_time_value_inf <- is.infinite(min_time_value ) && min_time_value < 0
94
- min_time_value_same_type <- typeof(min_time_value ) == typeof(x $ DT $ time_value ) &
95
- class(min_time_value ) == class(x $ DT $ time_value )
89
+ min_time_value_same_type <- identical(class(min_time_value ), class(x $ DT $ time_value ))
96
90
if (! min_time_value_inf && ! min_time_value_same_type ) {
97
91
cli_abort(" `min_time_value` must be either -Inf or a time_value of the same type and
98
92
class as `epi_archive$time_value`." )
@@ -941,9 +935,6 @@ epix_truncate_versions_after.epi_archive <- function(x, max_version) {
941
935
if (! identical(class(max_version ), class(x $ DT $ version ))) {
942
936
cli_abort(" `max_version` must have the same `class` as `epi_archive$DT$version`." )
943
937
}
944
- if (! identical(typeof(max_version ), typeof(x $ DT $ version ))) {
945
- cli_abort(" `max_version` must have the same `typeof` as `epi_archive$DT$version`." )
946
- }
947
938
assert_scalar(max_version , na.ok = FALSE )
948
939
if (max_version > x $ versions_end ) {
949
940
cli_abort(" `max_version` must be at most `epi_archive$versions_end`." )
@@ -1020,3 +1011,163 @@ dplyr_col_modify.col_modify_recorder_df <- function(data, cols) {
1020
1011
attr(data , " epiprocess::col_modify_recorder_df::cols" ) <- cols
1021
1012
data
1022
1013
}
1014
+
1015
+
1016
+
1017
+ # ' [`dplyr::filter`] for `epi_archive`s
1018
+ # '
1019
+ # ' @param .data an `epi_archive`
1020
+ # ' @param ... as in [`dplyr::filter`]; using the `version` column is not allowed
1021
+ # ' unless you use `.format_aware = TRUE`; see details.
1022
+ # ' @param .by as in [`dplyr::filter`]
1023
+ # ' @param .format_aware optional, `TRUE` or `FALSE`; default `FALSE`. See
1024
+ # ' details.
1025
+ # '
1026
+ # ' @details
1027
+ # '
1028
+ # ' By default, using the `version` column or measurement columns is disabled as
1029
+ # ' it's easy to get unexpected results. See if either [`epix_as_of`] or
1030
+ # ' [`epix_slide`] works for any version selection you have in mind: for version
1031
+ # ' selection, see the `version` or `.versions` args, respectively; for
1032
+ # ' measurement column-based filtering, try `filter`ing after `epix_as_of` or
1033
+ # ' inside the `.f` in `epix_slide()`. If they don't cover your use case, then
1034
+ # ' you can set `.format_aware = TRUE` to enable usage of these columns, but be
1035
+ # ' careful to:
1036
+ # ' * Factor in that `.data$DT` may have been converted into a compact format
1037
+ # ' based on diffing consecutive versions, and the last version of each
1038
+ # ' observation in `.data$DT` will always be carried forward to future
1039
+ # ' `version`s`; see details of [`as_epi_archive`].
1040
+ # ' * Set `clobberable_versions_start` and `versions_end` of the result
1041
+ # ' appropriately after the `filter` call. They will be initialized with the
1042
+ # ' same values as in `.data`.
1043
+ # '
1044
+ # ' `dplyr::filter` also has an optional argument `.preserve`, which should not
1045
+ # ' have an impact on (ungrouped) `epi_archive`s, and `grouped_epi_archive`s do
1046
+ # ' not currently support `dplyr::filter`.
1047
+ # '
1048
+ # ' @examples
1049
+ # '
1050
+ # ' # Filter to one location and a particular time range:
1051
+ # ' archive_cases_dv_subset %>%
1052
+ # ' filter(geo_value == "fl", time_value >= as.Date("2020-10-01"))
1053
+ # '
1054
+ # ' # Convert to weekly by taking the Saturday data for each week, so that
1055
+ # ' # `case_rate_7d_av` represents a Sun--Sat average:
1056
+ # ' archive_cases_dv_subset %>%
1057
+ # ' filter(as.POSIXlt(time_value)$wday == 6L)
1058
+ # '
1059
+ # ' # Filtering involving the `version` column or measurement columns requires
1060
+ # ' # extra care. See epix_as_of and epix_slide instead for some common
1061
+ # ' # operations. One semi-common operation that ends up being fairly simple is
1062
+ # ' # treating observations as finalized after some amount of time, and ignoring
1063
+ # ' # any revisions that were made after that point:
1064
+ # ' archive_cases_dv_subset %>%
1065
+ # ' filter(
1066
+ # ' version <= time_value + as.difftime(60, units = "days"),
1067
+ # ' .format_aware = TRUE
1068
+ # ' )
1069
+ # '
1070
+ # ' @export
1071
+ filter.epi_archive <- function (.data , ... , .by = NULL , .format_aware = FALSE ) {
1072
+ in_tbl <- tibble :: as_tibble(as.list(.data $ DT ), .name_repair = " minimal" )
1073
+ if (.format_aware ) {
1074
+ out_tbl <- in_tbl %> %
1075
+ filter(... , .by = {{ .by }})
1076
+ } else {
1077
+ measurement_colnames <- setdiff(names(.data $ DT ), key_colnames(.data ))
1078
+ forbidden_colnames <- c(" version" , measurement_colnames )
1079
+ out_tbl <- in_tbl %> %
1080
+ filter(
1081
+ # Add our own fake filter arg to the user's ..., to update the data mask
1082
+ # to prevent `version` column usage.
1083
+ {
1084
+ # We should be evaluating inside the data mask. To disable both
1085
+ # `version` and `.data$version` etc., we need to go to the ancestor
1086
+ # environment containing the data mask's column bindings. This is
1087
+ # likely just the parent env, but search to make sure, in a way akin
1088
+ # to `<<-`:
1089
+ e <- environment()
1090
+ while (! identical(e , globalenv()) && ! identical(e , emptyenv())) { # nolint:vector_logic_linter
1091
+ if (" version" %in% names(e )) {
1092
+ # This is where the column bindings are. Replace the forbidden ones.
1093
+ # They are expected to be active bindings, so directly
1094
+ # assigning has issues; `rm` first.
1095
+ rm(list = forbidden_colnames , envir = e )
1096
+ eval_env <- new.env(parent = asNamespace(" epiprocess" )) # see (2) below
1097
+ delayedAssign(
1098
+ " version" ,
1099
+ cli_abort(c(
1100
+ " Using `version` in `filter.epi_archive` may produce unexpected results." ,
1101
+ " >" = " See if `epix_as_of` or `epix_slide` would work instead." ,
1102
+ " >" = " If not, see `?filter.epi_archive` details for how to proceed."
1103
+ ), class = " epiprocess__filter_archive__used_version" ),
1104
+ eval.env = eval_env ,
1105
+ assign.env = e
1106
+ )
1107
+ for (measurement_colname in measurement_colnames ) {
1108
+ # Record current `measurement_colname` and set up execution for
1109
+ # the promise for the error in its own dedicated environment, so
1110
+ # that (1) `for` loop updating its value and `rm` cleanup don't
1111
+ # mess things up. We can also (2) prevent changes to data mask
1112
+ # ancestry (to involve user's quosure env rather than our
1113
+ # quosure env) or contents (from edge case of user binding
1114
+ # functions inside the mask) from potentially interfering by
1115
+ # setting the promise's execution environment to skip over the
1116
+ # data mask.
1117
+ eval_env <- new.env(parent = asNamespace(" epiprocess" ))
1118
+ eval_env [[" local_measurement_colname" ]] <- measurement_colname
1119
+ delayedAssign(
1120
+ measurement_colname ,
1121
+ cli_abort(c(
1122
+ " Using `{format_varname(local_measurement_colname)}`
1123
+ in `filter.epi_archive` may produce unexpected results." ,
1124
+ " >" = " See `?filter.epi_archive` details for how to proceed."
1125
+ ), class = " epiprocess__filter_archive__used_measurement" ),
1126
+ eval.env = eval_env ,
1127
+ assign.env = e
1128
+ )
1129
+ }
1130
+ break
1131
+ }
1132
+ e <- parent.env(e )
1133
+ }
1134
+ # Don't mask similarly-named user objects in ancestor envs:
1135
+ rm(list = c(" e" , " measurement_colname" , " eval_env" ))
1136
+ TRUE
1137
+ },
1138
+ ... ,
1139
+ .by = {{ .by }}
1140
+ )
1141
+ }
1142
+ # We could try to re-infer the geo_type, e.g., when filtering from
1143
+ # national+state to just state. However, we risk inference failures such as
1144
+ # "hrr" -> "hhs" from filtering to hrr 10, or "custom" -> USA-related when
1145
+ # working with non-USA data:
1146
+ out_geo_type <- .data $ geo_type
1147
+ if (.data $ time_type == " day" ) {
1148
+ # We might be going from daily to weekly; re-infer:
1149
+ out_time_type <- guess_time_type(out_tbl $ time_value )
1150
+ } else {
1151
+ # We might be filtering weekly to a single time_value; avoid re-inferring to
1152
+ # stay "week". Or in other cases, just skip inferring, as re-inferring is
1153
+ # expected to match the input time_type:
1154
+ out_time_type <- .data $ time_type
1155
+ }
1156
+ # Even if they narrow down to just a single value of an other_keys column,
1157
+ # it's probably still better (& simpler) to treat it as an other_keys column
1158
+ # since it still exists in the result:
1159
+ out_other_keys <- .data $ other_keys
1160
+ # `filter` makes no guarantees about not aliasing columns in its result when
1161
+ # the filter condition is all TRUE, so don't setDT.
1162
+ out_dtbl <- as.data.table(out_tbl , key = out_other_keys )
1163
+ result <- new_epi_archive(
1164
+ out_dtbl ,
1165
+ out_geo_type , out_time_type , out_other_keys ,
1166
+ # Assume version-related metadata unchanged; part of why we want to push
1167
+ # back on filter expressions like `.data$version <= .env$as_of`:
1168
+ .data $ clobberable_versions_start , .data $ versions_end
1169
+ )
1170
+ # Filtering down rows while keeping all (ukey) columns should preserve ukey
1171
+ # uniqueness.
1172
+ result
1173
+ }
0 commit comments