Skip to content

Commit ec243a8

Browse files
committed
Bring back slide unique-key checks, make as_epi_df ukey checks faster
1 parent f8ba9f8 commit ec243a8

File tree

9 files changed

+152
-23
lines changed

9 files changed

+152
-23
lines changed

NAMESPACE

+5
Original file line numberDiff line numberDiff line change
@@ -109,13 +109,16 @@ importFrom(checkmate,assert)
109109
importFrom(checkmate,assert_character)
110110
importFrom(checkmate,assert_class)
111111
importFrom(checkmate,assert_data_frame)
112+
importFrom(checkmate,assert_false)
112113
importFrom(checkmate,assert_function)
113114
importFrom(checkmate,assert_int)
114115
importFrom(checkmate,assert_list)
115116
importFrom(checkmate,assert_logical)
116117
importFrom(checkmate,assert_numeric)
117118
importFrom(checkmate,assert_scalar)
118119
importFrom(checkmate,assert_string)
120+
importFrom(checkmate,assert_subset)
121+
importFrom(checkmate,assert_tibble)
119122
importFrom(checkmate,checkInt)
120123
importFrom(checkmate,check_atomic)
121124
importFrom(checkmate,check_data_frame)
@@ -165,6 +168,7 @@ importFrom(dplyr,groups)
165168
importFrom(dplyr,if_all)
166169
importFrom(dplyr,if_any)
167170
importFrom(dplyr,if_else)
171+
importFrom(dplyr,is_grouped_df)
168172
importFrom(dplyr,lag)
169173
importFrom(dplyr,mutate)
170174
importFrom(dplyr,near)
@@ -236,3 +240,4 @@ importFrom(tsibble,as_tsibble)
236240
importFrom(utils,capture.output)
237241
importFrom(utils,tail)
238242
importFrom(vctrs,vec_data)
243+
importFrom(vctrs,vec_equal)

R/epi_df.R

+9-11
Original file line numberDiff line numberDiff line change
@@ -279,22 +279,20 @@ as_epi_df.tbl_df <- function(
279279
}
280280

281281
assert_character(other_keys)
282+
assert_subset(other_keys, names(x))
283+
# Fix up if given more than just other keys, at least until epipredict#428
284+
# merged:
285+
other_keys <- other_keys[!other_keys %in% c("geo_value", "time_value")]
282286

283287
if (".time_value_counts" %in% other_keys) {
284288
cli_abort("as_epi_df: `other_keys` can't include \".time_value_counts\"")
285289
}
286290

287-
if (anyDuplicated(x[c("geo_value", "time_value", other_keys)])) {
288-
duplicated_time_values <- x %>%
289-
group_by(across(all_of(c("geo_value", "time_value", other_keys)))) %>%
290-
filter(dplyr::n() > 1) %>%
291-
ungroup()
292-
bad_data <- capture.output(duplicated_time_values)
293-
cli_abort(
294-
"as_epi_df: some groups in the data have duplicated time values. epi_df requires a unique time_value per group.",
295-
body = c("Sample groups:", bad_data)
296-
)
297-
}
291+
assert(check_ukey_unique(x, c("geo_value", other_keys, "time_value"), c(
292+
">" = "If this is line list data, convert it to counts/rates first.",
293+
">" = "If this contains a demographic breakdown, check that you have
294+
specified appropriate `other_keys`" # . from checkmate
295+
)))
298296

299297
new_epi_df(x, geo_type, time_type, as_of, other_keys)
300298
}

R/epiprocess-package.R

+6
Original file line numberDiff line numberDiff line change
@@ -5,20 +5,26 @@
55
#' @import epidatasets
66
#' @importFrom checkmate anyInfinite anyMissing assert assert_character
77
#' @importFrom checkmate assert_class assert_data_frame assert_int assert_list
8+
#' @importFrom checkmate assert_false
89
#' @importFrom checkmate assert_logical assert_numeric assert_scalar checkInt
910
#' @importFrom checkmate assert_string
11+
#' @importFrom checkmate assert_subset
12+
#' @importFrom checkmate assert_tibble
1013
#' @importFrom checkmate check_atomic check_data_frame expect_class test_int
1114
#' @importFrom checkmate check_names
1215
#' @importFrom checkmate test_subset test_set_equal vname
1316
#' @importFrom cli cli_abort cli_warn
1417
#' @importFrom data.table as.data.table
1518
#' @importFrom data.table key
1619
#' @importFrom data.table setkeyv
20+
#' @importFrom dplyr arrange
21+
#' @importFrom dplyr is_grouped_df
1722
#' @importFrom dplyr select
1823
#' @importFrom lifecycle deprecated
1924
#' @importFrom rlang %||%
2025
#' @importFrom rlang is_bare_integerish
2126
#' @importFrom vctrs vec_data
27+
#' @importFrom vctrs vec_equal
2228
## usethis namespace: end
2329
NULL
2430

R/slide.R

+4-12
Original file line numberDiff line numberDiff line change
@@ -259,18 +259,7 @@ epi_slide <- function(
259259
assert_logical(.all_rows, len = 1)
260260

261261
# Check for duplicated time values within groups
262-
duplicated_time_values <- .x %>%
263-
group_epi_df() %>%
264-
filter(dplyr::n() > 1) %>%
265-
ungroup()
266-
if (nrow(duplicated_time_values) > 0) {
267-
bad_data <- capture.output(duplicated_time_values)
268-
cli_abort(
269-
"as_epi_df: some groups in a resulting dplyr computation have duplicated time values.
270-
epi_df requires a unique time_value per group.",
271-
body = c("Sample groups:", bad_data)
272-
)
273-
}
262+
assert(check_ukey_unique(ungroup(.x), c(group_vars(.x), "time_value")))
274263

275264
# Begin handling completion. This will create a complete time index between
276265
# the smallest and largest time values in the data. This is used to ensure
@@ -752,6 +741,9 @@ epi_slide_opt <- function(
752741
)
753742
}
754743

744+
# Check for duplicated time values within groups
745+
assert(check_ukey_unique(ungroup(.x), c(group_vars(.x), "time_value")))
746+
755747
# The position of a given column can be differ between input `.x` and
756748
# `.data_group` since the grouping step by default drops grouping columns.
757749
# To avoid rerunning `eval_select` for every `.data_group`, convert

R/utils.R

+54
Original file line numberDiff line numberDiff line change
@@ -1195,7 +1195,61 @@ time_type_unit_abbr <- function(time_type) {
11951195
maybe_unit_abbr
11961196
}
11971197

1198+
#' Extract singular element of a length-1 unnamed list (validated)
1199+
#'
1200+
#' Inverse of `list(elt)`.
1201+
#'
1202+
#' @param x a length-1 list
1203+
#' @return x[[1L]], if x actually was a length-1 list; error otherwise
1204+
#'
1205+
#' @keywords internal
11981206
unwrap <- function(x) {
11991207
checkmate::assert_list(x, len = 1L, names = "unnamed")
12001208
x[[1L]]
12011209
}
1210+
1211+
#' Check that a unique key is indeed unique in a tibble (TRUE/str)
1212+
#'
1213+
#' A `checkmate`-style check function.
1214+
#'
1215+
#' @param x a tibble, with no particular row or column order (if you have a
1216+
#' guaranteed row order based on the ukey you can probably do something more
1217+
#' efficient)
1218+
#' @param ukey_names character vector; subset of column names of `x` denoting a
1219+
#' unique key.
1220+
#' @param end_cli_message optional character vector, a cli message format
1221+
#' string/vector; information/advice to tack onto any error messages.
1222+
#' @return `TRUE` if no ukey is duplicated (i.e., `x[ukey_names]` has no
1223+
#' duplicated rows); string with an error message if there are errors.
1224+
#'
1225+
#' @keywords internal
1226+
check_ukey_unique <- function(x, ukey_names, end_cli_message = character()) {
1227+
assert_tibble(x) # to not have to think about `data.table` perf, xface
1228+
assert_false(is_grouped_df(x)) # to not have to think about `grouped_df` perf, xface
1229+
assert_character(ukey_names)
1230+
assert_subset(ukey_names, names(x))
1231+
#
1232+
if (nrow(x) <= 1L) {
1233+
TRUE
1234+
} else {
1235+
# Fast check, slow error message.
1236+
arranged_ukeys <- arrange(x[ukey_names], across(all_of(ukey_names)))
1237+
if (!any(vec_equal(arranged_ukeys[-1L, ], arranged_ukeys[-nrow(arranged_ukeys), ]))) {
1238+
TRUE
1239+
} else {
1240+
bad_data <- x %>%
1241+
group_by(across(all_of(ukey_names))) %>%
1242+
filter(dplyr::n() > 1) %>%
1243+
ungroup()
1244+
lines <- c(
1245+
cli::format_error("
1246+
There cannot be more than one row with the same combination of
1247+
{format_varnames(ukey_names)}. Problematic rows:
1248+
"),
1249+
capture.output(bad_data),
1250+
cli::format_message(end_cli_message)
1251+
)
1252+
paste(collapse = "\n", lines)
1253+
}
1254+
}
1255+
}

man/check_ukey_unique.Rd

+27
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/unwrap.Rd

+18
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

tests/testthat/_snaps/epi_df.md

+15
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# as_epi_df errors on nonunique epikeytime
2+
3+
Code
4+
as_epi_df(tibble::tibble(geo_value = 1, time_value = 1, value = 1:2), as_of = 5)
5+
Condition
6+
Error:
7+
! Assertion on 'x' failed: There cannot be more than one row with the same combination of geo_value and time_value. Problematic rows:
8+
# A tibble: 2 x 3
9+
geo_value time_value value
10+
<dbl> <dbl> <int>
11+
1 1 1 1
12+
2 1 1 2
13+
> If this is line list data, convert it to counts/rates first.
14+
> If this contains a demographic breakdown, check that you have specified appropriate `other_keys`.
15+

tests/testthat/test-epi_df.R

+14
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,20 @@ test_that("as_epi_df errors for non-character other_keys", {
4040
expect_silent(as_epi_df(ex_input, other_keys = c("state", "pol")))
4141
})
4242

43+
test_that("as_epi_df errors on nonunique epikeytime", {
44+
expect_snapshot(
45+
as_epi_df(tibble::tibble(
46+
geo_value = 1, time_value = 1, value = 1:2
47+
), as_of = 5),
48+
error = TRUE
49+
)
50+
expect_no_error(
51+
as_epi_df(tibble::tibble(
52+
geo_value = 1, age_group = 1:2, time_value = 1, value = 1:2
53+
), other_keys = "age_group", as_of = 5)
54+
)
55+
})
56+
4357
test_that("as_epi_df works for nonstandard input", {
4458
tib <- tibble::tibble(
4559
x = 1:10, y = 1:10,

0 commit comments

Comments
 (0)