34
34
# ' `NA`'s compactify is run again if `compactify` is `TRUE` to make
35
35
# ' sure there are no duplicate values from occasions when the signal is
36
36
# ' revised to `NA`, and then back to its immediately-preceding value.
37
- # ' @param print_inform bool, determines whether to print summary information, or
38
- # ' only return the full summary tibble
39
37
# ' @param min_waiting_period `difftime`, integer or `NULL`. Sets a cutoff: any
40
38
# ' time_values that have not had at least `min_waiting_period` to stabilize as
41
39
# ' of the `versions_end` are removed. `min_waiting_period` should characterize
42
40
# ' the typical time during which most significant revisions occur. The default
43
41
# ' of 60 days corresponds to a typical near-final value for case counts as
44
42
# ' reported in the context of insurance. To avoid this filtering, either set
45
- # ' to `NULL` or 0.
43
+ # ' to `NULL` or 0. A `difftime` will be rounded up to the appropriate `time_type` if
44
+ # ' necessary (that is 5 days will be rounded to 1 week if the data is weekly).
46
45
# ' @param within_latest double between 0 and 1. Determines the threshold
47
46
# ' used for the `lag_to`
48
- # ' @param quick_revision difftime or integer (integer is treated as days), for
49
- # ' the printed summary, the amount of time between the final revision and the
50
- # ' actual time_value to consider the revision quickly resolved. Default of 3
51
- # ' days
52
- # ' @param few_revisions integer, for the printed summary, the upper bound on the
53
- # ' number of revisions to consider "few". Default is 3.
54
- # ' @param abs_spread_threshold length-1 numeric, for the printed summary, the
55
- # ' maximum spread used to characterize revisions which don't actually change
56
- # ' very much. Default is 5% of the maximum value in the dataset, but this is
57
- # ' the most unit dependent of values, and likely needs to be chosen
58
- # ' appropriate for the scale of the dataset.
59
- # ' @param rel_spread_threshold length-1 double between 0 and 1, for the printed
60
- # ' summary, the relative spread fraction used to characterize revisions which
61
- # ' don't actually change very much. Default is .1, or 10% of the final value
62
47
# ' @param compactify bool. If `TRUE`, we will compactify after the signal
63
48
# ' requested in `...` has been selected on its own and the `drop_nas` step.
64
49
# ' This helps, for example, to give similar results when called on
67
52
# ' requested signal. The default is `TRUE`.
68
53
# ' @param compactify_abs_tol length-1 double, used if `compactify` is `TRUE`, it
69
54
# ' determines the threshold for when two doubles are considered identical.
55
+ # ' @param return_only_tibble boolean to return only the simple `tibble` of
56
+ # ' computational results rather than the complete S3 object.
70
57
# '
71
58
# ' @details Applies to `epi_archive`s with `time_type`s of `"day"`, `"week"`,
72
59
# ' and `"yearmonth"`. It can also work with a `time_type` of `"integer"` if
76
63
# ' produce incorrect results for some calculations, since week numbering
77
64
# ' contains jumps at year boundaries.
78
65
# '
66
+ # ' @return An S3 object with class `revision_behavior`. This function is typically
67
+ # ' called for the purposes of inspecting the printed output. The
68
+ # ' results of the computations are available in
69
+ # ' `revision_analysis(...)$revision_behavior`. If you only want to access
70
+ # ' the internal computations, use `return_only_tibble = TRUE`.
71
+ # '
79
72
# ' @examples
80
- # ' revision_example <- revision_summary (archive_cases_dv_subset, percent_cli)
81
- # ' revision_example %>% arrange(desc(spread))
73
+ # ' revision_example <- revision_analysis (archive_cases_dv_subset, percent_cli)
74
+ # ' revision_example$revision_behavior %>% arrange(desc(spread))
82
75
# '
83
76
# ' @export
84
77
# ' @importFrom cli cli_inform cli_abort cli_li
85
78
# ' @importFrom rlang list2 syms dots_n
86
79
# ' @importFrom vctrs vec_cast
87
80
# ' @importFrom dplyr mutate group_by arrange filter if_any all_of across pull pick c_across
88
81
# ' everything ungroup summarize if_else %>%
89
- revision_summary <- function (epi_arch ,
90
- ... ,
91
- drop_nas = TRUE ,
92
- print_inform = TRUE ,
93
- min_waiting_period = as.difftime(60 , units = " days" ) %> %
94
- difftime_approx_ceiling_time_delta(epi_arch $ time_type ),
95
- within_latest = 0.2 ,
96
- quick_revision = as.difftime(3 , units = " days" ) %> %
97
- difftime_approx_ceiling_time_delta(epi_arch $ time_type ),
98
- few_revisions = 3 ,
99
- abs_spread_threshold = NULL ,
100
- rel_spread_threshold = 0.1 ,
101
- compactify = TRUE ,
102
- compactify_abs_tol = 0 ) {
82
+ revision_analysis <- function (epi_arch ,
83
+ ... ,
84
+ drop_nas = TRUE ,
85
+ min_waiting_period = as.difftime(60 , units = " days" ),
86
+ within_latest = 0.2 ,
87
+ compactify = TRUE ,
88
+ compactify_abs_tol = 0 ,
89
+ return_only_tibble = FALSE ) {
103
90
assert_class(epi_arch , " epi_archive" )
91
+ if (methods :: is(min_waiting_period , " difftime" )) {
92
+ min_waiting_period <- min_waiting_period %> %
93
+ difftime_approx_ceiling_time_delta(epi_arch $ time_type )
94
+ }
104
95
# if the column to summarize isn't specified, use the only one if there is only one
105
96
if (dots_n(... ) == 0 ) {
106
97
# Choose the first column that's not a key:
@@ -126,11 +117,6 @@ revision_summary <- function(epi_arch,
126
117
cli_abort(" Not currently implementing more than one column at a time. Run each separately." )
127
118
}
128
119
}
129
- if (is.null(abs_spread_threshold )) {
130
- abs_spread_threshold <- .05 * epi_arch $ DT %> %
131
- pull(!! arg ) %> %
132
- max(na.rm = TRUE )
133
- }
134
120
# for each time_value, get
135
121
# the number of revisions
136
122
# the maximum spread in value (both absolute and relative)
@@ -193,63 +179,113 @@ revision_summary <- function(epi_arch,
193
179
time_value , geo_value , all_of(epikey_names ), n_revisions , min_lag , max_lag , # nolint: object_usage_linter
194
180
lag_near_latest , spread , rel_spread , min_value , max_value , median_value # nolint: object_usage_linter
195
181
)
196
- if (print_inform ) {
197
- cli_inform(" Min lag (time to first version):" )
198
- time_delta_summary(revision_behavior $ min_lag , time_type ) %> % print()
199
- if (! drop_nas ) {
200
- total_na <- epi_arch $ DT %> %
201
- filter(is.na(c_across(!! arg ))) %> % # nolint: object_usage_linter
202
- nrow()
203
- cli_inform(" Fraction of all versions that are `NA`:" )
204
- cli_li(num_percent(total_na , nrow(epi_arch $ DT ), " " ))
205
- cli_inform(" " )
206
- }
207
- cli_inform(" Fraction of epi_key+time_values with" )
208
- total_num <- nrow(revision_behavior ) # nolint: object_usage_linter
209
- total_num_unrevised <- sum(revision_behavior $ n_revisions == 0 ) # nolint: object_usage_linter
210
- cli_inform(" No revisions:" )
211
- cli_li(num_percent(total_num_unrevised , total_num , " " ))
212
- total_quickly_revised <- sum( # nolint: object_usage_linter
213
- time_delta_to_n_steps(revision_behavior $ max_lag , time_type ) < =
214
- time_delta_to_n_steps(quick_revision , time_type )
215
- )
216
- cli_inform(" Quick revisions (last revision within {format_time_delta(quick_revision, time_type)}
217
- of the `time_value`):" )
218
- cli_li(num_percent(total_quickly_revised , total_num , " " ))
219
- total_barely_revised <- sum( # nolint: object_usage_linter
220
- revision_behavior $ n_revisions < =
221
- few_revisions
222
- )
223
- cli_inform(" Few revisions (At most {few_revisions} revisions for that `time_value`):" )
224
- cli_li(num_percent(total_barely_revised , total_num , " " ))
225
- cli_inform(" " )
226
- cli_inform(" Fraction of revised epi_key+time_values which have:" )
182
+ total_na <- epi_arch $ DT %> %
183
+ filter(is.na(c_across(!! arg ))) %> % # nolint: object_usage_linter
184
+ nrow()
185
+ if (! return_only_tibble ) {
186
+ revision_behavior <- structure(list (
187
+ revision_behavior = revision_behavior ,
188
+ range_time_values = range(epi_arch $ DT $ time_value ),
189
+ signal_variable = arg ,
190
+ drop_nas = drop_nas ,
191
+ time_type = time_type ,
192
+ total_na = total_na ,
193
+ max_val = max(epi_arch $ DT [[arg ]], na.rm = TRUE ),
194
+ n_obs = nrow(epi_arch $ DT ),
195
+ within_latest = within_latest
196
+ ), class = " revision_analysis" )
197
+ }
198
+ return (revision_behavior )
199
+ }
200
+
201
+
227
202
228
- real_revisions <- revision_behavior %> % filter(n_revisions > 0 ) # nolint: object_usage_linter
229
- n_real_revised <- nrow(real_revisions ) # nolint: object_usage_linter
230
- rel_spread <- sum( # nolint: object_usage_linter
231
- real_revisions $ rel_spread <
232
- rel_spread_threshold ,
233
- na.rm = TRUE
234
- ) + sum(is.na(real_revisions $ rel_spread ))
235
- cli_inform(" Less than {rel_spread_threshold} spread in relative value:" )
236
- cli_li(num_percent(rel_spread , n_real_revised , " " ))
237
- abs_spread <- sum( # nolint: object_usage_linter
238
- real_revisions $ spread >
239
- abs_spread_threshold
240
- ) # nolint: object_usage_linter
241
- cli_inform(" Spread of more than {abs_spread_threshold} in actual value (when revised):" )
242
- cli_li(num_percent(abs_spread , n_real_revised , " " ))
243
203
244
- # time_type_unit_pluralizer[[time_type]] is a format string controlled by us
245
- # and/or downstream devs, so we can paste it onto our format string safely:
246
- units_plural <- pluralize(paste0(" {qty(2)}" , time_type_unit_pluralizer [[time_type ]])) # nolint: object_usage_linter
247
- cli_inform(" {toTitleCase(units_plural)} until within {within_latest*100}% of the latest value:" )
248
- time_delta_summary(revision_behavior [[" lag_near_latest" ]], time_type ) %> % print()
204
+ # ' Print a `revision_analysis` object
205
+ # '
206
+ # ' @param x a `revision_analysis` object
207
+ # ' @param quick_revision Difftime or integer (integer is treated as days).
208
+ # ' The amount of time between the final revision and the
209
+ # ' actual time_value to consider the revision quickly resolved. Default of 3
210
+ # ' days. This will be rounded up to the appropriate `time_type` if
211
+ # ' necessary (that is 5 days will be rounded to 1 week if the data is weekly).
212
+ # ' @param few_revisions Integer. The upper bound on the
213
+ # ' number of revisions to consider "few". Default is 3.
214
+ # ' @param abs_spread_threshold Scalar numeric. The
215
+ # ' maximum spread used to characterize revisions which don't actually change
216
+ # ' very much. Default is 5% of the maximum value in the dataset, but this is
217
+ # ' the most unit dependent of values, and likely needs to be chosen
218
+ # ' appropriate for the scale of the dataset.
219
+ # ' @param rel_spread_threshold Scalar between 0 and 1. The relative spread fraction used to characterize revisions which
220
+ # ' don't actually change very much. Default is .1, or 10% of the final value
221
+ # '
222
+ # ' @rdname revision_analysis
223
+ # ' @export
224
+ print.revision_analysis <- function (x ,
225
+ quick_revision = as.difftime(3 , units = " days" ),
226
+ few_revisions = 3 ,
227
+ abs_spread_threshold = NULL ,
228
+ rel_spread_threshold = 0.1 ,
229
+ ... ) {
230
+ if (methods :: is(quick_revision , " difftime" )) {
231
+ quick_revision <- quick_revision %> %
232
+ difftime_approx_ceiling_time_delta(x $ time_type )
249
233
}
250
- return (revision_behavior )
234
+ if (is.null(abs_spread_threshold )) abs_spread_threshold <- .05 * x $ max_val
235
+ rev_beh <- x $ revision_behavior
236
+ cli :: cli_h2(" An epi_archive spanning {.val {x$range_time_values[1]}} to {.val {x$range_time_values[1]}}." )
237
+ cli :: cli_h3(" Min lag (time to first version):" )
238
+ time_delta_summary(rev_beh $ min_lag , x $ time_type ) %> % print()
239
+ if (! x $ drop_nas ) {
240
+ cli_inform(" Fraction of all versions that are `NA`:" )
241
+ cli_li(num_percent(x $ total_na , x $ n_obs , " " ))
242
+ cli_inform(" " )
243
+ }
244
+ cli :: cli_h3(" Fraction of epi_key + time_values with" )
245
+ total_num <- nrow(rev_beh ) # nolint: object_usage_linter
246
+ total_num_unrevised <- sum(rev_beh $ n_revisions == 0 ) # nolint: object_usage_linter
247
+ cli_inform(" No revisions:" )
248
+ cli_li(num_percent(total_num_unrevised , total_num , " " ))
249
+ total_quickly_revised <- sum( # nolint: object_usage_linter
250
+ time_delta_to_n_steps(rev_beh $ max_lag , x $ time_type ) < =
251
+ time_delta_to_n_steps(quick_revision , x $ time_type )
252
+ )
253
+ cli_inform(" Quick revisions (last revision within {format_time_delta(quick_revision, x$time_type)}
254
+ of the `time_value`):" )
255
+ cli_li(num_percent(total_quickly_revised , total_num , " " ))
256
+ total_barely_revised <- sum(rev_beh $ n_revisions < = few_revisions )
257
+ cli_inform(" Few revisions (At most {.val {few_revisions}} revisions for that `time_value`):" )
258
+ cli_li(num_percent(total_barely_revised , total_num , " " ))
259
+
260
+ cli :: cli_h3(" Fraction of revised epi_key + time_values which have:" )
261
+
262
+ real_revisions <- rev_beh %> % filter(n_revisions > 0 ) # nolint: object_usage_linter
263
+ n_real_revised <- nrow(real_revisions ) # nolint: object_usage_linter
264
+ rel_spread <- sum( # nolint: object_usage_linter
265
+ real_revisions $ rel_spread < rel_spread_threshold ,
266
+ na.rm = TRUE
267
+ ) + sum(is.na(real_revisions $ rel_spread ))
268
+ cli_inform(" Less than {.val {rel_spread_threshold}} spread in relative value:" )
269
+ cli_li(num_percent(rel_spread , n_real_revised , " " ))
270
+ abs_spread <- sum( # nolint: object_usage_linter
271
+ real_revisions $ spread > abs_spread_threshold
272
+ ) # nolint: object_usage_linter
273
+ divid <- cli :: cli_div(theme = list (.val = list (digits = 3 )))
274
+ cli_inform(" Spread of more than {.val {abs_spread_threshold}} in actual value (when revised):" )
275
+ cli :: cli_end(divid )
276
+ cli_li(num_percent(abs_spread , n_real_revised , " " ))
277
+
278
+ # time_type_unit_pluralizer[[time_type]] is a format string controlled by us
279
+ # and/or downstream devs, so we can paste it onto our format string safely:
280
+ units_plural <- pluralize(paste0(" {qty(2)}" , time_type_unit_pluralizer [[x $ time_type ]])) # nolint: object_usage_linter
281
+ cli :: cli_h3(" {toTitleCase(units_plural)} until within {.val {x$within_latest*100}}% of the latest value:" )
282
+ time_delta_summary(rev_beh [[" lag_near_latest" ]], x $ time_type ) %> % print()
251
283
}
252
284
285
+ # ' @export
286
+ # ' @rdname revision_analysis
287
+ revision_summary <- revision_analysis
288
+
253
289
# ' pull the value from lags when values starts indefinitely being within prop of its latest value.
254
290
# ' @param lags vector of lags; should be sorted
255
291
# ' @param values this should be a vector (e.g., a column) with length matching that of `lags`
0 commit comments