Skip to content

Commit cd7787d

Browse files
committedAug 13, 2024
from JingJing: min/max/med value, recent time cutoff
1 parent 4b7f28a commit cd7787d

7 files changed

+99
-93
lines changed
 

‎R/revision_analysis.R

+32-26
Original file line numberDiff line numberDiff line change
@@ -9,25 +9,35 @@
99
#' includes `NA`'s)
1010
#' 3. `max_lag`: the amount of time until the final (new) version (same caveat
1111
#' for `drop_nas=FALSE`, though it is far less likely to matter)
12-
#' 4. `spread`: the difference between the smallest and largest values (this
12+
#' 4. `min_value`: the minimum value across revisions
13+
#' 5. `max_value`: the maximum value across revisions
14+
#' 6. `median_value`: the median value across revisions
15+
#' 7. `spread`: the difference between the smallest and largest values (this
1316
#' always excludes `NA` values)
14-
#' 5. `rel_spread`: `spread` divided by the largest value (so it will
17+
#' 8. `rel_spread`: `spread` divided by the largest value (so it will
1518
#' always be less than 1). Note that this need not be the final value. It will
1619
#' be `NaN` when `spread` and the largest value are both 0.
17-
#' 6. `time_near_latest`: This gives the lag when the value is within
20+
#' 9. `time_near_latest`: This gives the lag when the value is within
1821
#' `within_latest` (default 20%) of the value at the latest time. For example,
1922
#' consider the series (0, 20, 99, 150, 102, 100); then `time_near_latest` is
2023
#' the 5th index, since even though 99 is within 20%, it is outside the window
2124
#' afterwards at 150.
2225
#' @param epi_arch an epi_archive to be analyzed
23-
#' @param ... <[`tidyselect`][dplyr_tidy_select]>, used to choose the column to summarize. If empty, it
24-
#' chooses the first. Currently only implemented for one column at a time.
26+
#' @param ... <[`tidyselect`][dplyr_tidy_select]>, used to choose the column to
27+
#' summarize. If empty, it chooses the first. Currently only implemented for
28+
#' one column at a time.
2529
#' @param drop_nas bool, drop any `NA` values from the archive? After dropping
2630
#' `NA`'s compactify is run again to make sure there are no duplicate values
2731
#' from occasions when the signal is revised to `NA`, and then back to its
2832
#' immediately-preceding value.
2933
#' @param print_inform bool, determines whether to print summary information, or
3034
#' only return the full summary tibble
35+
#' @param min_waiting_period `difftime`, integer or `NULL`. Sets a cutoff: any
36+
#' time_values not earlier than `min_waiting_period` before `versions_end` are
37+
#' removed. `min_waiting_period` should characterize the typical time during
38+
#' which revisions occur. The default of 60 days corresponds to a typical
39+
#' final value for case counts as reported in the context of insurance. To
40+
#' avoid this filtering, either set to `NULL` or 0.
3141
#' @param within_latest double between 0 and 1. Determines the threshold
3242
#' used for the `time_near_latest` column.
3343
#' @param quick_revision difftime or integer (integer is treated as days), for
@@ -60,6 +70,7 @@ revision_summary <- function(epi_arch,
6070
...,
6171
drop_nas = TRUE,
6272
print_inform = TRUE,
73+
min_waiting_period = as.difftime(60, units = "days"),
6374
within_latest = 0.2,
6475
quick_revision = as.difftime(3, units = "days"),
6576
few_revisions = 3,
@@ -92,6 +103,11 @@ revision_summary <- function(epi_arch,
92103
revision_behavior <-
93104
epi_arch$DT %>%
94105
select(c(geo_value, time_value, all_of(keys), version, !!arg))
106+
if (!is.null(min_waiting_period)) {
107+
revision_behavior <- revision_behavior %>%
108+
filter(abs(time_value - as.Date(epi_arch$versions_end)) >= min_waiting_period)
109+
}
110+
95111
if (drop_nas) {
96112
# if we're dropping NA's, we should recompactify
97113
revision_behavior <-
@@ -113,18 +129,22 @@ revision_summary <- function(epi_arch,
113129
n_revisions = dplyr::n() - 1,
114130
min_lag = min(lag), # nolint: object_usage_linter
115131
max_lag = max(lag), # nolint: object_usage_linter
116-
spread = spread_vec(pick(!!arg)),
117-
rel_spread = spread / max_no_na(pick(!!arg)), # nolint: object_usage_linter
132+
min_value = f_no_na(min, pick(!!arg)),
133+
max_value = f_no_na(max, pick(!!arg)),
134+
median_value = f_no_na(median, pick(!!arg)),
118135
time_to = time_within_x_latest(lag, pick(!!arg), prop = within_latest), # nolint: object_usage_linter
119136
.groups = "drop"
120137
) %>%
121138
mutate(
139+
spread = max_value - min_value, # nolint: object_usage_linter
140+
rel_spread = spread / max_value, # nolint: object_usage_linter
122141
# TODO the units here may be a problem
123142
min_lag = as.difftime(min_lag, units = "days"), # nolint: object_usage_linter
124143
max_lag = as.difftime(max_lag, units = "days"), # nolint: object_usage_linter
125144
time_near_latest = as.difftime(time_to, units = "days") # nolint: object_usage_linter
126145
) %>%
127-
select(-time_to)
146+
select(-time_to) %>%
147+
relocate(time_value, geo_value, all_of(keys), n_revisions, min_lag, max_lag, time_near_latest, spread, rel_spread, min_value, max_value, median_value)
128148
if (print_inform) {
129149
cli_inform("Min lag (time to first version):")
130150
difftime_summary(revision_behavior$min_lag) %>% print()
@@ -203,31 +223,17 @@ get_last_run <- function(bool_vec, values_from) {
203223
values_from[[length(bool_vec) - tail(runs$lengths, n = 1) + 1]]
204224
}
205225

206-
#' the default behavior returns a warning on empty lists, which we do not want,
207-
#' and there is no super clean way of preventing this
226+
#' use when the default behavior returns a warning on empty lists, which we do
227+
#' not want, and there is no super clean way of preventing this
208228
#' @keywords internal
209-
max_no_na <- function(x) {
229+
f_no_na <- function(f, x) {
210230
x <- x[!is.na(x)]
211231
if (length(x) == 0) {
212232
return(Inf)
213233
} else {
214-
return(max(x))
234+
return(f(x))
215235
}
216236
}
217-
#' the default behavior returns a warning on empty lists, which we do not want
218-
#' @keywords internal
219-
spread_vec <- function(x) {
220-
x <- x[!is.na(x)]
221-
if (length(x) == 0) {
222-
return(-Inf)
223-
} else {
224-
res <- x %>%
225-
range(na.rm = TRUE) %>%
226-
diff(na.rm = TRUE)
227-
return(res)
228-
}
229-
}
230-
231237

232238

233239
#' simple util for printing a fraction and its percent

‎man/f_no_na.Rd

+14
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

‎man/max_no_na.Rd

-14
This file was deleted.

‎man/revision_summary.Rd

+14-2
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

‎man/spread_vec.Rd

-12
This file was deleted.

‎tests/testthat/_snaps/revision-latency-functions.md

+38-38
Original file line numberDiff line numberDiff line change
@@ -24,25 +24,25 @@
2424
Output
2525
min median mean max
2626
0 days 3 days 6.9 days 19 days
27-
# A tibble: 7 x 8
28-
time_value geo_value n_revisions min_lag max_lag spread rel_spread
29-
<date> <chr> <dbl> <drtn> <drtn> <dbl> <dbl>
30-
1 2020-01-01 ak 4 2 days 19 days 101 0.990
31-
2 2020-01-01 al 1 0 days 19 days 99 0.99
32-
3 2020-01-02 ak 1 4 days 5 days 9 0.09
33-
4 2020-01-02 al 0 0 days 0 days 0 0
34-
5 2020-01-03 ak 0 3 days 3 days 0 NaN
35-
6 2020-01-03 al 1 1 days 2 days 3 0.75
36-
7 2020-01-04 al 0 1 days 1 days 0 0
37-
time_near_latest
38-
<drtn>
39-
1 19 days
40-
2 19 days
41-
3 4 days
42-
4 0 days
43-
5 3 days
44-
6 2 days
45-
7 1 days
27+
# A tibble: 7 x 11
28+
time_value geo_value n_revisions min_lag max_lag time_near_latest spread
29+
<date> <chr> <dbl> <drtn> <drtn> <drtn> <dbl>
30+
1 2020-01-01 ak 4 2 days 19 days 19 days 101
31+
2 2020-01-01 al 1 0 days 19 days 19 days 99
32+
3 2020-01-02 ak 1 4 days 5 days 4 days 9
33+
4 2020-01-02 al 0 0 days 0 days 0 days 0
34+
5 2020-01-03 ak 0 3 days 3 days 3 days 0
35+
6 2020-01-03 al 1 1 days 2 days 2 days 3
36+
7 2020-01-04 al 0 1 days 1 days 1 days 0
37+
rel_spread min_value max_value median_value
38+
<dbl> <dbl> <dbl> <dbl>
39+
1 0.990 1 102 6
40+
2 0.99 1 100 50.5
41+
3 0.09 91 100 95.5
42+
4 0 1 1 1
43+
5 NaN 0 0 0
44+
6 0.75 1 4 2.5
45+
7 0 9 9 9
4646

4747
---
4848

@@ -72,23 +72,23 @@
7272
Output
7373
min median mean max
7474
0 days 3 days 6.9 days 19 days
75-
# A tibble: 7 x 8
76-
time_value geo_value n_revisions min_lag max_lag spread rel_spread
77-
<date> <chr> <dbl> <drtn> <drtn> <dbl> <dbl>
78-
1 2020-01-01 ak 6 2 days 19 days 101 0.990
79-
2 2020-01-01 al 1 0 days 19 days 99 0.99
80-
3 2020-01-02 ak 1 4 days 5 days 9 0.09
81-
4 2020-01-02 al 0 0 days 0 days 0 0
82-
5 2020-01-03 ak 0 3 days 3 days 0 NaN
83-
6 2020-01-03 al 1 1 days 2 days 3 0.75
84-
7 2020-01-04 al 1 0 days 1 days 0 0
85-
time_near_latest
86-
<drtn>
87-
1 19 days
88-
2 19 days
89-
3 4 days
90-
4 0 days
91-
5 3 days
92-
6 2 days
93-
7 1 days
75+
# A tibble: 7 x 11
76+
time_value geo_value n_revisions min_lag max_lag time_near_latest spread
77+
<date> <chr> <dbl> <drtn> <drtn> <drtn> <dbl>
78+
1 2020-01-01 ak 6 2 days 19 days 19 days 101
79+
2 2020-01-01 al 1 0 days 19 days 19 days 99
80+
3 2020-01-02 ak 1 4 days 5 days 4 days 9
81+
4 2020-01-02 al 0 0 days 0 days 0 days 0
82+
5 2020-01-03 ak 0 3 days 3 days 3 days 0
83+
6 2020-01-03 al 1 1 days 2 days 2 days 3
84+
7 2020-01-04 al 1 0 days 1 days 1 days 0
85+
rel_spread min_value max_value median_value
86+
<dbl> <dbl> <dbl> <dbl>
87+
1 0.990 1 102 5.5
88+
2 0.99 1 100 50.5
89+
3 0.09 91 100 95.5
90+
4 0 1 1 1
91+
5 NaN 0 0 0
92+
6 0.75 1 4 2.5
93+
7 0 9 9 9
9494

‎tests/testthat/test-revision-latency-functions.R

+1-1
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ dummy_ex <- tibble::tribble(
2828
"ak", as.Date("2020-01-03"), as.Date("2020-01-06"), 0,
2929
"ak", as.Date("2020-01-03"), as.Date("2020-01-07"), 0,
3030
) %>%
31-
as_epi_archive(compactify = FALSE)
31+
as_epi_archive(versions_end = as.Date("2022-01-01"), compactify = FALSE)
3232

3333
test_that("revision_summary works for a dummy dataset", {
3434
expect_snapshot(dummy_ex %>% revision_summary() %>% print(n = 10, width = 300))

0 commit comments

Comments
 (0)
Please sign in to comment.