Skip to content

Commit 1f44295

Browse files
authored
Merge pull request #492 from cmu-delphi/revisionSummary
revision analysis first draft
2 parents 3e4c98e + 889d53e commit 1f44295

22 files changed

+777
-54
lines changed

DESCRIPTION

+3-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
Type: Package
22
Package: epiprocess
33
Title: Tools for basic signal processing in epidemiology
4-
Version: 0.8.2
4+
Version: 0.8.3
55
Authors@R: c(
66
person("Jacob", "Bien", role = "ctb"),
77
person("Logan", "Brooks", email = "[email protected]", role = c("aut", "cre")),
@@ -31,6 +31,7 @@ Imports:
3131
data.table,
3232
dplyr (>= 1.0.8),
3333
genlasso,
34+
glue,
3435
ggplot2,
3536
glue,
3637
lifecycle (>= 1.0.1),
@@ -83,6 +84,7 @@ Collate:
8384
'methods-epi_df.R'
8485
'outliers.R'
8586
'reexports.R'
87+
'revision_analysis.R'
8688
'slide.R'
8789
'utils.R'
8890
'utils_pipe.R'

NAMESPACE

+12
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,7 @@ export(new_epi_df)
8181
export(next_after)
8282
export(relocate)
8383
export(rename)
84+
export(revision_summary)
8485
export(slice)
8586
export(time_column_names)
8687
export(ungroup)
@@ -111,6 +112,7 @@ importFrom(checkmate,vname)
111112
importFrom(cli,cat_line)
112113
importFrom(cli,cli_abort)
113114
importFrom(cli,cli_inform)
115+
importFrom(cli,cli_li)
114116
importFrom(cli,cli_vec)
115117
importFrom(cli,cli_warn)
116118
importFrom(cli,format_message)
@@ -128,8 +130,11 @@ importFrom(data.table,set)
128130
importFrom(data.table,setDF)
129131
importFrom(data.table,setkeyv)
130132
importFrom(dplyr,"%>%")
133+
importFrom(dplyr,across)
134+
importFrom(dplyr,all_of)
131135
importFrom(dplyr,arrange)
132136
importFrom(dplyr,bind_rows)
137+
importFrom(dplyr,c_across)
133138
importFrom(dplyr,dplyr_col_modify)
134139
importFrom(dplyr,dplyr_reconstruct)
135140
importFrom(dplyr,dplyr_row_slice)
@@ -142,11 +147,17 @@ importFrom(dplyr,group_vars)
142147
importFrom(dplyr,groups)
143148
importFrom(dplyr,if_all)
144149
importFrom(dplyr,if_any)
150+
importFrom(dplyr,if_else)
151+
importFrom(dplyr,lag)
145152
importFrom(dplyr,mutate)
153+
importFrom(dplyr,near)
154+
importFrom(dplyr,pick)
155+
importFrom(dplyr,pull)
146156
importFrom(dplyr,relocate)
147157
importFrom(dplyr,rename)
148158
importFrom(dplyr,select)
149159
importFrom(dplyr,slice)
160+
importFrom(dplyr,summarize)
150161
importFrom(dplyr,tibble)
151162
importFrom(dplyr,ungroup)
152163
importFrom(ggplot2,autoplot)
@@ -178,6 +189,7 @@ importFrom(rlang,is_formula)
178189
importFrom(rlang,is_function)
179190
importFrom(rlang,is_missing)
180191
importFrom(rlang,is_quosure)
192+
importFrom(rlang,list2)
181193
importFrom(rlang,missing_arg)
182194
importFrom(rlang,new_function)
183195
importFrom(rlang,quo_get_expr)

NEWS.md

+1
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat
88

99
- Added `complete.epi_df`, which fills in missing values in an `epi_df` with
1010
`NA`s. Uses `tidyr::complete` underneath and preserves `epi_df` metadata.
11+
- Inclusion of the function `revision_summary` to provide basic revision information for `epi_archive`s out of the box. (#492)
1112

1213
## Bug fixes
1314

R/archive.R

+71-33
Original file line numberDiff line numberDiff line change
@@ -240,6 +240,7 @@ NULL
240240
#' value of `clobberable_versions_start` does not fully trust these empty
241241
#' updates, and assumes that any version `>= max(x$version)` could be
242242
#' clobbered.) If `nrow(x) == 0`, then this argument is mandatory.
243+
#' @param compactify_tol double. the tolerance used to detect approximate equality for compactification
243244
#' @return An `epi_archive` object.
244245
#'
245246
#' @importFrom data.table as.data.table key setkeyv
@@ -295,15 +296,16 @@ new_epi_archive <- function(
295296
additional_metadata,
296297
compactify,
297298
clobberable_versions_start,
298-
versions_end) {
299+
versions_end,
300+
compactify_tol = .Machine$double.eps^0.5) {
299301
# Create the data table; if x was an un-keyed data.table itself,
300302
# then the call to as.data.table() will fail to set keys, so we
301303
# need to check this, then do it manually if needed
302304
key_vars <- c("geo_value", "time_value", other_keys, "version")
303-
DT <- as.data.table(x, key = key_vars) # nolint: object_name_linter
304-
if (!identical(key_vars, key(DT))) setkeyv(DT, cols = key_vars)
305+
data_table <- as.data.table(x, key = key_vars) # nolint: object_name_linter
306+
if (!identical(key_vars, key(data_table))) setkeyv(data_table, cols = key_vars)
305307

306-
if (anyDuplicated(DT, by = key(DT)) != 0L) {
308+
if (anyDuplicated(data_table, by = key(data_table)) != 0L) {
307309
cli_abort("`x` must have one row per unique combination of the key variables. If you
308310
have additional key variables other than `geo_value`, `time_value`, and
309311
`version`, such as an age group column, please specify them in `other_keys`.
@@ -313,38 +315,17 @@ new_epi_archive <- function(
313315
)
314316
}
315317

316-
# Checks to see if a value in a vector is LOCF
317-
is_locf <- function(vec) { # nolint: object_usage_linter
318-
dplyr::if_else(!is.na(vec) & !is.na(dplyr::lag(vec)),
319-
vec == dplyr::lag(vec),
320-
is.na(vec) & is.na(dplyr::lag(vec))
321-
)
322-
}
323-
324-
# LOCF is defined by a row where all values except for the version
325-
# differ from their respective lag values
326-
327-
# Checks for LOCF's in a data frame
328-
rm_locf <- function(df) {
329-
dplyr::filter(df, if_any(c(everything(), -version), ~ !is_locf(.))) # nolint: object_usage_linter
330-
}
331-
332-
# Keeps LOCF values, such as to be printed
333-
keep_locf <- function(df) {
334-
dplyr::filter(df, if_all(c(everything(), -version), ~ is_locf(.))) # nolint: object_usage_linter
335-
}
336-
318+
nrow_before_compactify <- nrow(data_table)
337319
# Runs compactify on data frame
338320
if (is.null(compactify) || compactify == TRUE) {
339-
elim <- keep_locf(DT)
340-
DT <- rm_locf(DT) # nolint: object_name_linter
321+
compactified <- apply_compactify(data_table, key_vars, compactify_tol)
341322
} else {
342-
# Create empty data frame for nrow(elim) to be 0
343-
elim <- tibble::tibble()
323+
compactified <- data_table
344324
}
345-
346-
# Warns about redundant rows
347-
if (is.null(compactify) && nrow(elim) > 0) {
325+
# Warns about redundant rows if the number of rows decreased, and we didn't
326+
# explicitly say to compactify
327+
if (is.null(compactify) && nrow(compactified) < nrow_before_compactify) {
328+
elim <- removed_by_compactify(data_table, key_vars, compactify_tol)
348329
warning_intro <- cli::format_inline(
349330
"Found rows that appear redundant based on
350331
last (version of each) observation carried forward;
@@ -366,7 +347,7 @@ new_epi_archive <- function(
366347

367348
structure(
368349
list(
369-
DT = DT,
350+
DT = compactified,
370351
geo_type = geo_type,
371352
time_type = time_type,
372353
additional_metadata = additional_metadata,
@@ -377,6 +358,63 @@ new_epi_archive <- function(
377358
)
378359
}
379360

361+
#' given a tibble as would be found in an epi_archive, remove duplicate entries.
362+
#' @description
363+
#' works by shifting all rows except the version, then comparing values to see
364+
#' if they've changed. We need to arrange in descending order, but note that
365+
#' we don't need to group, since at least one column other than version has
366+
#' changed, and so is kept.
367+
#' @keywords internal
368+
#' @importFrom dplyr filter
369+
apply_compactify <- function(df, keys, tolerance = .Machine$double.eps^.5) {
370+
df %>%
371+
arrange(!!!keys) %>%
372+
filter(if_any(
373+
c(everything(), -version), # all non-version columns
374+
~ !is_locf(., tolerance)
375+
))
376+
}
377+
378+
#' get the entries that `compactify` would remove
379+
#' @keywords internal
380+
#' @importFrom dplyr filter if_all everything
381+
removed_by_compactify <- function(df, keys, tolerance) {
382+
df %>%
383+
arrange(!!!keys) %>%
384+
filter(if_all(
385+
c(everything(), -version),
386+
~ is_locf(., tolerance)
387+
)) # nolint: object_usage_linter
388+
}
389+
390+
#' Checks to see if a value in a vector is LOCF
391+
#' @description
392+
#' LOCF meaning last observation carried forward. lags the vector by 1, then
393+
#' compares with itself. For doubles it uses float comparison via
394+
#' [`dplyr::near`], otherwise it uses equality. `NA`'s and `NaN`'s are
395+
#' considered equal to themselves and each other.
396+
#' @importFrom dplyr lag if_else near
397+
#' @keywords internal
398+
is_locf <- function(vec, tolerance) { # nolint: object_usage_linter
399+
lag_vec <- dplyr::lag(vec)
400+
if (typeof(vec) == "double") {
401+
res <- if_else(
402+
!is.na(vec) & !is.na(lag_vec),
403+
near(vec, lag_vec, tol = tolerance),
404+
is.na(vec) & is.na(lag_vec)
405+
)
406+
return(res)
407+
} else {
408+
res <- if_else(
409+
!is.na(vec) & !is.na(lag_vec),
410+
vec == lag_vec,
411+
is.na(vec) & is.na(lag_vec)
412+
)
413+
return(res)
414+
}
415+
}
416+
417+
380418
#' `validate_epi_archive` ensures correctness of arguments fed to `as_epi_archive`.
381419
#'
382420
#' @rdname epi_archive

R/epi_df.R

-1
Original file line numberDiff line numberDiff line change
@@ -242,7 +242,6 @@ as_epi_df.tbl_df <- function(
242242
must be present in `x`."
243243
)
244244
}
245-
246245
if (lifecycle::is_present(geo_type)) {
247246
cli_warn("epi_archive constructor argument `geo_type` is now ignored. Consider removing.")
248247
}

0 commit comments

Comments
 (0)