-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy patharchive.R
810 lines (762 loc) · 31.8 KB
/
archive.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
# We use special features of data.table's `[`. The data.table package has a
# compatibility feature that disables some/all of these features if it thinks we
# might expect `data.frame`-compatible behavior instead. We can signal that we
# want the special behavior via `.datatable_aware = TRUE` or by importing any
# `data.table` package member. Do both, to prevent surprises if we ever decide
# to use `data.table::` everywhere and stop importing package members.
.datatable_aware <- TRUE
#' Validate a version bound arg
#'
#' Expected to be used on `clobberable_versions_start`, `versions_end`, and
#' similar arguments. Some additional context-specific checks may be needed.
#' Side effects: raises an error if the version bound appears invalid.
#'
#' @param version_bound the version bound to validate
#' @param x a data frame containing a version column with which to check
#'   compatibility
#' @param na_ok Boolean; is `NA` an acceptable "bound"? (If so, `NA` will
#'   have a special context-dependent meaning.)
#' @param version_bound_arg optional string; what to call the version bound in
#'   error messages
#' @param x_arg optional string; what to call `x` in error messages
#'
#' @keywords internal
validate_version_bound <- function(version_bound, x, na_ok = FALSE,
                                   version_bound_arg = rlang::caller_arg(version_bound),
                                   x_arg = rlang::caller_arg(x)) {
  # Structural guards first: NULL and non-scalar bounds are never acceptable.
  if (is.null(version_bound)) {
    cli_abort(
      "{version_bound_arg} cannot be NULL",
      class = "epiprocess__version_bound_null"
    )
  }
  if (length(version_bound) != 1L) {
    cli_abort(
      "{version_bound_arg} must have length of 1",
      class = "epiprocess__version_bound_wrong_length"
    )
  }
  if (is.na(version_bound)) {
    # An NA bound is valid exactly when the caller opted in via `na_ok`:
    if (na_ok) {
      return(invisible(NULL))
    }
    cli_abort(
      "{version_bound_arg} cannot be NA",
      class = "epiprocess__version_bound_na_with_na_not_okay"
    )
  }
  # A non-NA bound must match `x$version` exactly in both `class` and `typeof`:
  if (!identical(class(version_bound), class(x[["version"]]))) {
    cli_abort(
      "{version_bound_arg} must have the same `class` vector as x$version,
      which has a `class` of {format_chr_deparse(class(x$version))}",
      class = "epiprocess__version_bound_mismatched_class"
    )
  }
  if (!identical(typeof(version_bound), typeof(x[["version"]]))) {
    cli_abort(
      "{version_bound_arg} must have the same `typeof` as x$version,
      which has a `typeof` of {typeof(x$version)}",
      class = "epiprocess__version_bound_mismatched_typeof"
    )
  }
  invisible(NULL)
}
#' `max(x$version)`, with error if `x` has 0 rows
#'
#' Exported to make defaults more easily copyable.
#'
#' @param x `x` argument of [`as_epi_archive`]
#'
#' @return `max(x$version)` if it has any rows; raises error if it has 0 rows or
#'   an `NA` version value
#'
#' @keywords internal
max_version_with_row_in <- function(x) {
  # With zero rows there is no principled guess at the latest observed version:
  if (nrow(x) == 0L) {
    cli_abort(
      "`nrow(x)==0L`, representing a data set history with no row up through the
      latest observed version, but we don't have a sensible guess at what version
      that is, or whether any of the empty versions might be clobbered in the
      future; if we use `x` to form an `epi_archive`, then
      `clobberable_versions_start` and `versions_end` must be manually specified.",
      class = "epiprocess__max_version_cannot_be_used"
    )
  }
  check_names(names(x), must.include = "version")
  versions <- x[["version"]]
  if (anyNA(versions)) {
    cli_abort("version values cannot be NA",
      class = "epiprocess__version_values_must_not_be_na"
    )
  }
  max(versions)
}
#' Get the next possible value greater than `x` of the same type
#'
#' S3 generic. Used, e.g., when we need the smallest version strictly after a
#' given version; "next" is type-dependent (integers step by 1, Dates by a day).
#'
#' @param x the starting "value"(s)
#' @return same class, typeof, and length as `x`
#'
#' @keywords internal
next_after <- function(x) UseMethod("next_after")
#' @keywords internal
next_after.integer <- function(x) x + 1L
# Adding 1L to a `Date` advances it by one day (`Date` is stored as a count of
# days since the epoch).
#' @keywords internal
next_after.Date <- function(x) x + 1L
#' `epi_archive` object
#'
#' @description The second main data structure for storing time series in
#' `epiprocess`. It is similar to `epi_df` in that it is fundamentally a table with
#' a few required columns that stores epidemiological time series data. An
#' `epi_archive` requires a `geo_value`, `time_value`, and `version` column (and
#' possibly other key columns) along with measurement values. In brief, an
#' `epi_archive` is a history of the time series data, where the `version`
#' column tracks the time at which the data was available. This allows for
#' version-aware forecasting.
#'
#' `new_epi_archive` is the low-level constructor for `epi_archive` objects that
#' only performs some fast, basic checks on the inputs. `validate_epi_archive`
#' can perform more costly validation checks on its output. But most users
#' should use `as_epi_archive`, which performs all necessary checks and has some
#' additional features.
#'
#' @details An `epi_archive` contains a `data.table` object `DT` (from the
#' `{data.table}` package), with (at least) the following columns:
#'
#' * `geo_value`: the geographic value associated with each row of measurements,
#' * `time_value`: the time value associated with each row of measurements,
#' * `version`: the time value specifying the version for each row of
#' measurements. For example, if in a given row the `version` is January 15,
#' 2022 and `time_value` is January 14, 2022, then this row contains the
#' measurements of the data for January 14, 2022 that were available one day
#' later.
#'
#' The variables `geo_value`, `time_value`, `version` serve as key variables for
#' the data table (in addition to any other keys specified in the metadata).
#' There can only be a single row per unique combination of key variables. The
#' keys for an `epi_archive` can be viewed with `key(epi_archive$DT)`.
#'
#' ## Compactification
#'
#' By default, an `epi_archive` will compactify the data table to remove
#' redundant rows. This is done by not storing rows that have the same value,
#' except for the `version` column (this is essentially a last observation
#' carried forward, but along the version index). This is done to save space and
#' improve performance. If you do not want to compactify the data, you can set
#' `compactify = FALSE` in `as_epi_archive()`.
#'
#' Note that in some data scenarios, LOCF may not be appropriate. For instance,
#' if you expected data to be updated on a given day, but your data source did
#' not update, then it could be reasonable to code the data as `NA` for that
#' day, instead of assuming LOCF.
#'
#' `NA`s *can* be introduced by `epi_archive` methods for other
#' reasons, e.g., in [`epix_fill_through_version`] and [`epix_merge`], if
#' requested, to represent potential update data that we do not yet have access
#' to; or in [`epix_merge`] to represent the "value" of an observation before
#' the version in which it was first released, or if no version of that
#' observation appears in the archive data at all.
#'
#' ## Metadata
#'
#' The following pieces of metadata are included as fields in an `epi_archive`
#' object:
#'
#' * `geo_type`: the type for the geo values.
#' * `time_type`: the type for the time values.
#' * `other_keys`: any additional keys as a character vector.
#' Typical examples are "age" or sub-geographies.
#'
#' While this metadata is not protected, it is generally recommended to treat it
#' as read-only, and to use the `epi_archive` methods to interact with the data
#' archive. Unexpected behavior may result from modifying the metadata
#' directly.
#'
#' @param data_table a `data.table` with [`data.table::key()`] equal to
#' `c("geo_value", other_keys, "time_value", "version")`. For `data.table`
#' users: this sets up an alias of `data_table`; if you plan to keep on
#' working with `data_table` or working directly with the archive's `$DT`
#' using mutating operations, you should `copy()` if appropriate. We will not
#' mutate the `DT` with any exported `{epiprocess}` functions, though.
#' @param geo_type DEPRECATED Has no effect. Geo value type is inferred from the
#' location column and set to "custom" if not recognized.
#' @param time_type DEPRECATED Has no effect. Time value type inferred from the
#' time column and set to "custom" if not recognized. Unpredictable behavior
#' may result if the time type is not recognized.
#' @param other_keys Character vector specifying the names of variables in `x`
#' that should be considered key variables (in the language of `data.table`)
#' apart from "geo_value", "time_value", and "version". Typical examples are
#' "age" or more granular geographies.
#' @param compactify Optional; `TRUE`, `FALSE`, or `"message"`. `TRUE` will
#' remove some redundant rows, `FALSE` will not. `"message"` is like `TRUE`
#' but will emit a message if anything was changed. Default is `TRUE`. See
#' more information below under "Compactification:".
#' @param compactify_abs_tol Optional; double. A tolerance level used to detect
#' approximate equality for compactification. The default is 0, which
#' corresponds to exact equality. Consider using this if your value columns
#' undergo tiny nonmeaningful revisions and the archive object with the
#' default setting is too large.
#' @param clobberable_versions_start Optional; `length`-1; either a value of the
#' same `class` and `typeof` as `x$version`, or an `NA` of any `class` and
#' `typeof`: specifically, either (a) the earliest version that could be
#' subject to "clobbering" (being overwritten with different update data, but
#' using the *same* version tag as the old update data), or (b) `NA`, to
#' indicate that no versions are clobberable. There are a variety of reasons
#' why versions could be clobberable under routine circumstances, such as (a)
#' today's version of one/all of the columns being published after initially
#' being filled with `NA` or LOCF, (b) a buggy version of today's data being
#' published but then fixed and republished later in the day, or (c) data
#' pipeline delays (e.g., publisher uploading, periodic scraping, database
#' syncing, periodic fetching, etc.) that make events (a) or (b) reflected
#' later in the day (or even on a different day) than expected; potential
#' causes vary between different data pipelines. The default value is `NA`,
#' which doesn't consider any versions to be clobberable. Another setting that
#' may be appropriate for some pipelines is `max_version_with_row_in(x)`.
#' @param versions_end Optional; length-1, same `class` and `typeof` as
#' `x$version`: what is the last version we have observed? The default is
#' `max_version_with_row_in(x)`, but values greater than this could also be
#' valid, and would indicate that we observed additional versions of the data
#' beyond `max(x$version)`, but they all contained empty updates. (The default
#' value of `clobberable_versions_start` does not fully trust these empty
#' updates, and assumes that any version `>= max(x$version)` could be
#' clobbered.) If `nrow(x) == 0`, then this argument is mandatory.
#' @return An `epi_archive` object.
#'
#' @seealso [`epix_as_of`] [`epix_merge`] [`epix_slide`]
#' @importFrom dplyr if_any if_all everything
#' @importFrom utils capture.output
#'
#' @name epi_archive
#' @export
#'
#' @examples
#' # Simple ex. with necessary keys
#' tib <- tibble::tibble(
#' geo_value = rep(c("ca", "hi"), each = 5),
#' time_value = rep(seq(as.Date("2020-01-01"),
#' by = 1, length.out = 5
#' ), times = 2),
#' version = rep(seq(as.Date("2020-01-02"),
#' by = 1, length.out = 5
#' ), times = 2),
#' value = rnorm(10, mean = 2, sd = 1)
#' )
#'
#' toy_epi_archive <- tib %>% as_epi_archive()
#' toy_epi_archive
#'
#' # Ex. with an additional key for county
#' df <- data.frame(
#' geo_value = c(replicate(2, "ca"), replicate(2, "fl")),
#' county = c(1, 3, 2, 5),
#' time_value = c(
#' "2020-06-01",
#' "2020-06-02",
#' "2020-06-01",
#' "2020-06-02"
#' ),
#' version = c(
#' "2020-06-02",
#' "2020-06-03",
#' "2020-06-02",
#' "2020-06-03"
#' ),
#' cases = c(1, 2, 3, 4),
#' cases_rate = c(0.01, 0.02, 0.01, 0.05)
#' )
#'
#' x <- df %>% as_epi_archive(other_keys = "county")
#'
new_epi_archive <- function(
    data_table,
    geo_type,
    time_type,
    other_keys,
    clobberable_versions_start,
    versions_end) {
  # Low-level constructor: fast, basic checks only. Costlier invariants (key
  # ordering, uniqueness, bound consistency) are left to `validate_epi_archive`.
  assert_class(data_table, "data.table")
  assert_string(geo_type)
  assert_string(time_type)
  assert_character(other_keys, any.missing = FALSE)
  # The three required key columns are implicit; `other_keys` must not repeat them:
  if (any(c("geo_value", "time_value", "version") %in% other_keys)) {
    cli_abort("`other_keys` cannot contain \"geo_value\", \"time_value\", or \"version\".")
  }
  assert_true(identical(key(data_table), c("geo_value", other_keys, "time_value", "version")))
  validate_version_bound(clobberable_versions_start, data_table, na_ok = TRUE)
  validate_version_bound(versions_end, data_table, na_ok = FALSE)
  # Note: `DT` aliases `data_table` (no copy is taken); see the `data_table`
  # param docs about mutating it afterward.
  structure(
    list(
      DT = data_table,
      geo_type = geo_type,
      time_type = time_type,
      other_keys = other_keys,
      clobberable_versions_start = clobberable_versions_start,
      versions_end = versions_end
    ),
    class = "epi_archive"
  )
}
#' Perform second (costly) round of validation that `x` is a proper `epi_archive`
#'
#' Checks key ordering, key uniqueness, agreement of `time_value`/`version`
#' classes, absence of missing versions, and consistency of the version bounds
#' with the data. Raises a classed error at the first problem found.
#'
#' @rdname epi_archive
#' @export
validate_epi_archive <- function(x) {
  assert_class(x, "epi_archive")
  # The DT key must be exactly c("geo_value", other_keys..., "time_value",
  # "version"), in that order:
  ukey_vars1 <- c("geo_value", x$other_keys, "time_value", "version")
  ukey_vars2 <- key(x$DT)
  if (!identical(ukey_vars1, ukey_vars2)) {
    cli_abort(c("`data.table::key(x$DT)` not as expected",
      "*" = "Based on `x$other_keys` the key should be {format_chr_deparse(ukey_vars1)}",
      "*" = "But `key(x$DT)` is {format_chr_deparse(ukey_vars2)}",
      ">" = "Consider reconstructing the archive from `x$DT` specifying
      the appropriate `other_keys`."
    ))
  }
  # Rely on data.table to ensure that these key columns exist.
  if (anyDuplicated(x$DT, by = key(x$DT)) != 0L) {
    cli_abort("`x$DT` must have one row per unique combination of the key variables. If you
      have additional key variables other than `geo_value`, `time_value`, and
      `version`, such as an age group column, please specify them in `other_keys`.
      Otherwise, check for duplicate rows and/or conflicting values for the same
      measurement.",
      class = "epiprocess__epi_archive_requires_unique_key"
    )
  }
  if (!identical(class(x$DT$time_value), class(x$DT$version))) {
    cli_abort(
      "`x$DT$time_value` and `x$DT$version` must have the same class.",
      class = "epiprocess__time_value_version_mismatch"
    )
  }
  if (anyMissing(x$DT$version)) {
    cli_abort("Column `version` must not contain missing values.")
  }
  # `versions_end` must be at least as late as every update actually recorded:
  if (nrow(x$DT) > 0L && x$versions_end < max(x$DT$version)) {
    cli_abort(
      "`x$versions_end` was {x$versions_end}, but `x$DT` contained
      updates for a later version or versions, up through {max(x$DT$version)}",
      class = "epiprocess__versions_end_earlier_than_updates"
    )
  }
  # A clobberable range starting after `versions_end` would be empty/nonsensical:
  if (!is.na(x$clobberable_versions_start) && x$clobberable_versions_start > x$versions_end) {
    cli_abort(
      "`x$versions_end` was {x$versions_end}; however, `x$clobberable_versions_start`
      was {x$clobberable_versions_start}, indicating that there were later observed versions",
      class = "epiprocess__versions_end_earlier_than_clobberable_versions_start"
    )
  }
  # Return x visibly due to popular `validate_...(new_...())` pattern:
  x
}
#' Given a tibble as would be found in an epi_archive, remove duplicate entries.
#'
#' After arranging by the unique key (`version` last), a row is redundant
#' ("LOCF") when it agrees with the immediately preceding row on everything
#' except `version`; such rows carry no new information and are dropped. No
#' grouping is needed: whenever the key changes between adjacent rows, at least
#' one non-version key column differs, so the row is kept.
#'
#' @param updates_df DT of an `epi_archive` or something analogous (though
#'   potentially unsorted) of another class
#' @param ukey_names chr; the column names forming a unique key for the
#'   `updates_df`; "version" must come last. For an `epi_archive`'s `DT`, this
#'   would be `key(DT)`.
#' @param abs_tol numeric, >=0; absolute tolerance to use on numeric measurement
#'   columns when determining whether something can be compactified away; see
#'   [`vec_approx_equal`]
#'
#' @importFrom data.table is.data.table key
#' @importFrom dplyr arrange filter
#' @importFrom vctrs vec_duplicate_any
#'
#' @keywords internal
apply_compactify <- function(updates_df, ukey_names, abs_tol = 0) {
  assert_data_frame(updates_df)
  assert_character(ukey_names)
  assert_subset(ukey_names, names(updates_df))
  if (vec_duplicate_any(ukey_names)) {
    cli_abort("`ukey_names` must not contain duplicates")
  }
  n_ukeys <- length(ukey_names)
  if (n_ukeys == 0 || ukey_names[[n_ukeys]] != "version") {
    cli_abort('"version" must appear in `ukey_names` and must be last.')
  }
  assert_numeric(abs_tol, len = 1, lower = 0)
  if (!is.data.table(updates_df)) {
    updates_df <- arrange(updates_df, pick(all_of(ukey_names)))
  } else if (!identical(key(updates_df), ukey_names)) {
    # A keyed data.table is already sorted by its key; anything else is an error.
    cli_abort(c("`ukey_names` should match `key(updates_df)`",
      "i" = "`ukey_names` was {format_chr_deparse(ukey_names)}",
      "i" = "`key(updates_df)` was {format_chr_deparse(key(updates_df))}"
    ))
  }
  # Keep the flags in a local: "When the first argument inside DT[...] is a
  # single symbol (e.g. DT[var]), data.table looks for var in calling scope".
  # And use `df[i, ]`, not `df[i]`, so plain data frames subset rows too.
  keep_flags <- !update_is_locf(updates_df, ukey_names, abs_tol)
  updates_df[keep_flags, ]
}
#' Get the entries that `compactify` would remove
#'
#' Counterpart of [`apply_compactify`]: returns the redundant (LOCF) rows
#' instead of dropping them. Used to show the user what compactification
#' eliminated.
#'
#' @keywords internal
#' @importFrom dplyr filter if_all everything
removed_by_compactify <- function(updates_df, ukey_names, abs_tol) {
  # A data.table keyed on `ukey_names` is already sorted; otherwise sort first.
  sorted_already <- is.data.table(updates_df) && identical(key(updates_df), ukey_names)
  if (!sorted_already) {
    updates_df <- arrange(updates_df, pick(all_of(ukey_names)))
  }
  locf_flags <- update_is_locf(updates_df, ukey_names, abs_tol)
  updates_df[locf_flags, ]
}
#' Internal helper; lgl; which updates are LOCF
#'
#' (Not validated:) `arranged_updates_df` must already be arranged by the unique
#' key (with `version` last), like an `epi_archive`'s `DT`. An update row is
#' "LOCF" when it matches the immediately preceding row on all non-version key
#' columns and (within `abs_tol`) on all measurement columns.
#'
#' @param arranged_updates_df an arranged update data frame like an `epi_archive` `DT`
#' @param ukey_names (not validated:) chr; the archive/equivalent
#'   [`key_colnames`]; must include `"version"`.
#' @param abs_tol (not validated:) as in [`apply_compactify`]
#'
#' @return lgl
#'
#' @keywords internal
update_is_locf <- function(arranged_updates_df, ukey_names, abs_tol) {
  n_updates <- nrow(arranged_updates_df)
  if (n_updates == 0L) {
    return(logical(0L))
  }
  if (n_updates == 1L) {
    # A sole observation has no predecessor to be a carried-forward copy of.
    return(FALSE)
  }
  # Use as.list to get a shallow "copy" in case of data.table, so that column
  # selection does not copy the column contents. Don't leak these column aliases
  # or it will break the data.table ownership model.
  col_aliases <- as.list(arranged_updates_df)
  all_names <- names(arranged_updates_df)
  ekt_names <- ukey_names[ukey_names != "version"]
  val_names <- all_names[!all_names %in% ukey_names]
  ekts_tbl <- new_tibble(col_aliases[ekt_names])
  vals_tbl <- new_tibble(col_aliases[val_names])
  # Compare each row 2..n against its predecessor 1..(n-1); n_updates >= 2L
  # here, so `:` is safe (this is why the n_updates == 1L case is separate).
  later_inds <- 2L:n_updates
  earlier_inds <- 1L:(n_updates - 1L)
  same_ekt <- vec_approx_equal0(ekts_tbl,
    inds1 = later_inds, ekts_tbl, inds2 = earlier_inds,
    # check ekt (key) cols with 0 tolerance:
    na_equal = TRUE, abs_tol = 0
  )
  same_vals <- vec_approx_equal0(vals_tbl,
    inds1 = later_inds, vals_tbl, inds2 = earlier_inds,
    na_equal = TRUE, abs_tol = abs_tol
  )
  # The first observation is never LOCF.
  c(FALSE, same_ekt & same_vals)
}
#' `as_epi_archive` converts a data frame, data table, or tibble into an
#' `epi_archive` object.
#'
#' @param x A data.frame, data.table, or tibble, with columns `geo_value`,
#'   `time_value`, `version`, and then any additional number of columns.
#' @param ... used for specifying column names, as in [`dplyr::rename`]. For
#'   example `version = release_date`
#' @param .versions_end location based versions_end, used to avoid prefix
#'   `version = issue` from being assigned to `versions_end` instead of being
#'   used to rename columns.
#'
#' @rdname epi_archive
#'
#' @export
as_epi_archive <- function(
    x,
    geo_type = deprecated(),
    time_type = deprecated(),
    other_keys = character(),
    compactify = TRUE,
    compactify_abs_tol = 0,
    clobberable_versions_start = NA,
    .versions_end = max_version_with_row_in(x), ...,
    versions_end = .versions_end) {
  assert_data_frame(x)
  # Convert first to data.frame to guard against data.table#6859 and potentially
  # other things epiprocess#618:
  x_already_copied <- identical(class(x), c("data.table", "data.frame"))
  x <- as.data.frame(x)
  x <- rename(x, ...)
  # Standardize required column names, inferring from common alternatives:
  x <- guess_column_name(x, "geo_value", geo_column_names())
  if (!all(other_keys %in% names(x))) {
    # Give a more tailored error message than as.data.table would:
    # (fixed grammar: "or using" -> "or use")
    cli_abort(c(
      "`x` is missing the following expected columns:
      {format_varnames(setdiff(other_keys, names(x)))}.",
      ">" = "You might need to `dplyr::rename()` beforehand
      or use `as_epi_archive()`'s renaming feature."
    ))
  }
  x <- guess_column_name(x, "time_value", time_column_names())
  x <- guess_column_name(x, "version", version_column_names())
  # Convert to data.table; `setDT` (in place) is safe only when `x` was already
  # a fresh data.table copy, otherwise take a copy via `as.data.table`:
  key_vars <- c("geo_value", other_keys, "time_value", "version")
  if (x_already_copied) {
    setDT(x, key = key_vars)
  } else {
    x <- as.data.table(x, key = key_vars)
  }
  if (lifecycle::is_present(geo_type)) {
    cli_warn("epi_archive constructor argument `geo_type` is now ignored. Consider removing.")
  }
  if (lifecycle::is_present(time_type)) {
    cli_warn("epi_archive constructor argument `time_type` is now ignored. Consider removing.")
  }
  # Geo/time types are always inferred from the data, never taken from args:
  geo_type <- guess_geo_type(x$geo_value)
  time_type <- guess_time_type(x$time_value)
  result <- validate_epi_archive(new_epi_archive(
    x, geo_type, time_type, other_keys,
    clobberable_versions_start, versions_end
  ))
  # Compactification:
  # (list-wrapping lets `%in%` compare against mixed-type alternatives)
  if (!list(compactify) %in% list(TRUE, FALSE, "message")) {
    cli_abort('`compactify` must be `TRUE`, `FALSE`, or `"message"`')
  }
  data_table <- result$DT # probably just `x`, but take no chances
  nrow_before_compactify <- nrow(data_table)
  # Runs compactify on data_table
  if (identical(compactify, TRUE) || identical(compactify, "message")) {
    compactified <- apply_compactify(data_table, key_vars, compactify_abs_tol)
  } else {
    compactified <- data_table
  }
  # Messages about redundant rows if the number of rows decreased, and we didn't
  # explicitly say to compactify
  if (identical(compactify, "message") && nrow(compactified) < nrow_before_compactify) {
    elim <- removed_by_compactify(data_table, key_vars, compactify_abs_tol)
    message_intro <- cli::format_inline(
      "Found rows that appear redundant based on
      last (version of each) observation carried forward;
      these rows have been removed to 'compactify' and save space:",
      keep_whitespace = FALSE
    )
    # Preview at most a few of the eliminated rows in the message body:
    message_data <- paste(collapse = "\n", capture.output(print(elim, topn = 3L, nrows = 7L)))
    message_outro <- cli::format_inline(
      "Built-in `epi_archive` functionality should be unaffected,
      but results may change if you work directly with its fields (such as `DT`).
      See `?as_epi_archive` for details.
      To silence this message but keep compactification,
      you can pass `compactify=TRUE` when constructing the archive.",
      keep_whitespace = FALSE
    )
    message_string <- paste(sep = "\n", message_intro, message_data, message_outro)
    rlang::inform(message_string, class = "epiprocess__compactify_default_removed_rows")
  }
  result$DT <- compactified
  result
}
#' Print information about an `epi_archive` object
#'
#' Prints a metadata summary (other keys, time range, version range, version
#' bounds) followed by a preview of the underlying `DT`.
#'
#' @param x An `epi_archive` object.
#' @param ... Should be empty, there to satisfy the S3 generic.
#' @param class Boolean; whether to print the class label header
#' @param methods Boolean; whether to print all available methods of
#'   the archive
#'
#' @importFrom cli cat_line format_message
#' @importFrom rlang check_dots_empty
#' @export
print.epi_archive <- function(x, ..., class = TRUE, methods = TRUE) {
  # NOTE(review): `check_dots_empty` is imported above but this manual check is
  # used instead; consider consolidating.
  if (rlang::dots_n(...) > 0) {
    cli_abort(c(
      "Error in print.epi_archive()",
      "i" = "Too many arguments passed to `print.epi_archive()`."
    ))
  }
  # Each "i" entry below is conditionally NULL; `c()` drops NULLs, so only the
  # applicable lines appear. Strings are cli glue templates interpolated by
  # `format_message`.
  cat_line(format_message(
    c(
      ">" = if (class) "An `epi_archive` object, with metadata:",
      "i" = if (length(setdiff(key(x$DT), c("geo_value", "time_value", "version"))) > 0) {
        "Other DT keys: {setdiff(key(x$DT), c('geo_value', 'time_value', 'version'))}"
      },
      "i" = if (nrow(x$DT) != 0L) {
        "Min/max time values: {min(x$DT$time_value)} / {max(x$DT$time_value)}"
      },
      "i" = if (nrow(x$DT) != 0L) {
        "First/last version with update: {min(x$DT$version)} / {max(x$DT$version)}"
      },
      "i" = if (!is.na(x$clobberable_versions_start)) {
        "Clobberable versions start: {x$clobberable_versions_start}"
      },
      "i" = "Versions end: {x$versions_end}",
      "i" = "A preview of the table ({nrow(x$DT)} rows x {ncol(x$DT)} columns):"
    )
  ))
  # `x$DT[]` rather than `x$DT`: presumably to force printing even after
  # by-reference data.table operations -- NOTE(review): confirm intent.
  print(x$DT[])
  return(invisible(x))
}
#' `group_by` and related methods for `epi_archive`, `grouped_epi_archive`
#'
#' @param .data An `epi_archive` or `grouped_epi_archive`
#' @param ... Similar to [`dplyr::group_by`] (see "Details:" for edge cases);
#' * For `group_by`: unquoted variable name(s) or other
#' ["data masking"][dplyr::dplyr_data_masking] expression(s). It's possible to
#' use [`dplyr::mutate`]-like syntax here to calculate new columns on which to
#' perform grouping, but note that, if you are regrouping an already-grouped
#' `.data` object, the calculations will be carried out ignoring such grouping
#' (same as [in dplyr][dplyr::group_by]).
#' * For `ungroup`: either
#' * empty, in order to remove the grouping and output an `epi_archive`; or
#' * variable name(s) or other ["tidy-select"][dplyr::dplyr_tidy_select]
#' expression(s), in order to remove the matching variables from the list of
#' grouping variables, and output another `grouped_epi_archive`.
#' @param .add Boolean. If `FALSE`, the default, the output will be grouped by
#' the variable selection from `...` only; if `TRUE`, the output will be
#' grouped by the current grouping variables plus the variable selection from
#' `...`.
#' @param .drop As described in [`dplyr::group_by`]; determines treatment of
#' factor columns.
#' @param x For `groups`, `group_vars`, or `ungroup`: a `grouped_epi_archive`;
#' for `is_grouped_epi_archive`: any object
#' @param .tbl (For `group_by_drop_default`:) an `epi_archive` or
#' `grouped_epi_archive` (`epi_archive` dispatches to the S3 default method;
#' `grouped_epi_archive` dispatches its own S3 method)
#'
#' @details
#'
#' To match `dplyr`, `group_by` allows "data masking" (also referred to as
#' "tidy evaluation") expressions `...`, not just column names, in a way similar
#' to `mutate`. Note that replacing or removing key columns with these
#' expressions is disabled.
#'
#' `archive %>% group_by()` and other expressions that group or regroup by zero
#' columns (indicating that all rows should be treated as part of one large
#' group) will output a `grouped_epi_archive`, in order to enable the use of
#' `grouped_epi_archive` methods on the result. This is in slight contrast to
#' the same operations on tibbles and grouped tibbles, which will *not* output a
#' `grouped_df` in these circumstances.
#'
#' Using `group_by` with `.add=FALSE` to override the existing grouping is
#' disabled; instead, `ungroup` first then `group_by`.
#'
#' `group_by_drop_default` on (ungrouped) `epi_archive`s is expected to dispatch
#' to `group_by_drop_default.default` (but there is a dedicated method for
#' `grouped_epi_archive`s).
#'
#' @examples
#'
#' grouped_archive <- archive_cases_dv_subset %>% group_by(geo_value)
#'
#' # `print` for metadata and method listing:
#' grouped_archive %>% print()
#'
#' # The primary use for grouping is to perform a grouped `epix_slide`:
#'
#' archive_cases_dv_subset %>%
#' group_by(geo_value) %>%
#' epix_slide(
#' .f = ~ mean(.x$case_rate_7d_av),
#' .before = 2,
#' .versions = as.Date("2020-06-11") + 0:2,
#' .new_col_name = "case_rate_3d_av"
#' ) %>%
#' ungroup()
#'
#' # -----------------------------------------------------------------
#'
#' # Advanced: some other features of dplyr grouping are implemented:
#'
#' library(dplyr)
#' toy_archive <-
#' tribble(
#' ~geo_value, ~age_group, ~time_value, ~version, ~value,
#' "us", "adult", "2000-01-01", "2000-01-02", 121,
#' "us", "pediatric", "2000-01-02", "2000-01-03", 5, # (addition)
#' "us", "adult", "2000-01-01", "2000-01-03", 125, # (revision)
#' "us", "adult", "2000-01-02", "2000-01-03", 130 # (addition)
#' ) %>%
#' mutate(
#' age_group = ordered(age_group, c("pediatric", "adult")),
#' time_value = as.Date(time_value),
#' version = as.Date(version)
#' ) %>%
#' as_epi_archive(other_keys = "age_group")
#'
#' # The following are equivalent:
#' toy_archive %>% group_by(geo_value, age_group)
#' toy_archive %>%
#' group_by(geo_value) %>%
#' group_by(age_group, .add = TRUE)
#' grouping_cols <- c("geo_value", "age_group")
#' toy_archive %>% group_by(across(all_of(grouping_cols)))
#'
#' # And these are equivalent:
#' toy_archive %>% group_by(geo_value)
#' toy_archive %>%
#' group_by(geo_value, age_group) %>%
#' ungroup(age_group)
#'
#' # To get the grouping variable names as a character vector:
#' toy_archive %>%
#' group_by(geo_value) %>%
#' group_vars()
#'
#' # To get the grouping variable names as a `list` of `name`s (a.k.a. symbols):
#' toy_archive %>%
#' group_by(geo_value) %>%
#' groups()
#'
#' toy_archive %>%
#' group_by(geo_value, age_group, .drop = FALSE) %>%
#' epix_slide(.f = ~ sum(.x$value), .before = 20) %>%
#' ungroup()
#'
#' @importFrom dplyr group_by
#' @export
#'
#' @aliases grouped_epi_archive
group_by.epi_archive <- function(.data, ..., .add = FALSE,
                                 .drop = dplyr::group_by_drop_default(.data)) {
  # `add` makes no difference; this is an ungrouped `epi_archive`.
  # Evaluate the data-masking grouping expressions (mutate-like) and record the
  # resulting grouping column names:
  detailed_mutate <- epix_detailed_restricted_mutate(.data, ...)
  assert_logical(.drop)
  if (!.drop) {
    # `.drop = FALSE` only makes sense with factor grouping columns; warn about
    # the common misuse cases below.
    grouping_cols <- as.list(detailed_mutate[["archive"]][["DT"]])[detailed_mutate[["request_names"]]]
    grouping_col_is_factor <- purrr::map_lgl(grouping_cols, is.factor)
    # ^ Use `as.list` to try to avoid any possibility of a deep copy.
    if (length(grouping_cols) != 0L && !any(grouping_col_is_factor)) {
      cli_warn(
        "`.drop=FALSE` but none of the grouping columns are factors;
        did you mean to convert one of the columns to a factor beforehand?",
        class = "epiprocess__group_by_epi_archive__drop_FALSE_no_factors"
      )
    } else if (any(diff(grouping_col_is_factor) == -1L)) {
      # diff == -1L detects a TRUE (factor) immediately followed by a FALSE
      # (non-factor), i.e., a non-factor grouping column listed after a factor.
      cli_warn(
        "`.drop=FALSE` but there are one or more non-factor grouping columns listed
        after a factor grouping column; this may produce groups with `NA`s for these
        non-factor columns; see https://github.com/tidyverse/dplyr/issues/5369#issuecomment-683762553;
        depending on how you want completion to work, you might instead want to convert all
        grouping columns to factors beforehand, specify the non-factor grouping columns first,
        or use `.drop=TRUE` and add a call to `tidyr::complete()`.",
        class = "epiprocess__group_by_epi_archive__drop_FALSE_nonfactor_after_factor"
      )
    }
  }
  new_grouped_epi_archive(detailed_mutate[["archive"]],
    detailed_mutate[["request_names"]],
    drop = .drop
  )
}
#' Clone an `epi_archive` object.
#'
#' An `epi_archive`'s `DT` is a `data.table`, which has reference semantics;
#' cloning deep-copies the `DT` so that by-reference modifications of the
#' clone's `DT` do not affect the original.
#'
#' @param x An `epi_archive` object.
#' @return A copy of `x` whose `DT` is an independent `data.table`.
#'
#' @importFrom data.table copy
#' @export
clone <- function(x) {
  UseMethod("clone")
}
#' @rdname clone
#' @export
clone.epi_archive <- function(x) {
  # Only `DT` needs an explicit copy; the other fields are plain R values and
  # `x` itself is copied on modify.
  x$DT <- data.table::copy(x$DT)
  x
}