diff --git a/.github/workflows/doc-preview.yaml b/.github/workflows/doc-preview.yaml new file mode 100644 index 000000000..068184225 --- /dev/null +++ b/.github/workflows/doc-preview.yaml @@ -0,0 +1,65 @@ +on: + issue_comment: + types: [created] + +name: doc-preview.yaml + +permissions: read-all + +jobs: + preview: + if: ${{ github.event.issue.pull_request && (github.event.comment.author_association == 'COLLABORATOR' || github.event.comment.author_association == 'CONTRIBUTOR' || github.event.comment.author_association == 'MEMBER' || github.event.comment.author_association == 'OWNER') && startsWith(github.event.comment.body, '/preview-docs') }} + + runs-on: ubuntu-latest + permissions: + # Needed to write a comment on the PR + pull-requests: write + # Needed to read the PR branch + contents: read + steps: + - uses: actions/checkout@v4 + with: + # Checkout the PR branch + ref: refs/pull/${{ github.event.issue.number }}/head + + - uses: r-lib/actions/setup-pandoc@v2 + + - uses: r-lib/actions/setup-r@v2 + with: + use-public-rspm: true + + - uses: r-lib/actions/setup-r-dependencies@v2 + with: + extra-packages: any::pkgdown, local::. + needs: website + + - name: Build site + run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE) + shell: Rscript {0} + + - name: Deploy to Netlify + uses: nwtgck/actions-netlify@v3.0 + with: + # Standard config + github-token: ${{ secrets.GITHUB_TOKEN }} + deploy-message: "Deploy from GitHub Actions" + # 'docs/' is the default directory for pkgdown::build_site() + # we add 'dev' because _pkgdown.yml has 'development: mode: devel' + publish-dir: './docs/dev' + # Development deploys only + production-deploy: false + # Enable pull request comment (default) + enable-pull-request-comment: true + # Overwrite the pull request comment with updated link (default) + overwrites-pull-request-comment: true + # Don't deploy to GitHub + enable-github-deployment: false + # Don't update the status of the commit + enable-commit-status: false + # Don't comment on the commit + enable-commit-comment: false + env: + # Netlify credentials (currently from Dmitry's account) + NETLIFY_AUTH_TOKEN: ${{ secrets.NETLIFY_AUTH_TOKEN }} + NETLIFY_SITE_ID: ${{ secrets.NETLIFY_SITE_ID }} + timeout-minutes: 1 diff --git a/.lintr b/.lintr index c7c90554d..066b3c000 100644 --- a/.lintr +++ b/.lintr @@ -1,6 +1,5 @@ linters: linters_with_defaults( line_length_linter(120), - cyclocomp_linter = NULL, object_length_linter(length = 40L) ) exclusions: list( diff --git a/DESCRIPTION b/DESCRIPTION index e894f0999..57981a195 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,12 +1,12 @@ -Package: epiprocess Type: Package +Package: epiprocess Title: Tools for basic signal processing in epidemiology -Version: 0.10.0 +Version: 0.11.0 Authors@R: c( person("Jacob", "Bien", role = "ctb"), person("Logan", "Brooks", , "lcbrooks+github@andrew.cmu.edu", role = c("aut", "cre")), person("Rafael", "Catoia", role = "ctb"), - person("Nat", "DeFries", role = "ctb"), + person("Nat", "DeFries", role = "aut"), person("Daniel", "McDonald", role = "aut"), person("Rachel", "Lobay", role = "ctb"), person("Ken", "Mawer", role = "ctb"), @@ -23,52 +23,56 @@ Authors@R: c( person("Posit", role = "cph", comment = "Copyright holder of included rlang fragments"), person("Johns Hopkins University Center for Systems Science and Engineering", role = "dtc", - comment = "Owner of COVID-19 cases and deaths data from the COVID-19 Data Repository"), + comment = "Owner of COVID-19 cases and deaths data from the COVID-19 Data 
Repository"), person("Johns Hopkins University", role = "cph", - comment = "Copyright holder of COVID-19 cases and deaths data from the COVID-19 Data Repository"), + comment = "Copyright holder of COVID-19 cases and deaths data from the COVID-19 Data Repository"), person("Carnegie Mellon University Delphi Group", role = "dtc", - comment = "Owner of claims-based CLI data from the Delphi Epidata API") + comment = "Owner of claims-based CLI data from the Delphi Epidata API") ) -Description: This package introduces common data structures for working with - epidemiological data reported by location and time and offers associated - utilities to perform basic signal processing tasks. The package is designed - to be used in conjunction with `epipredict` for building and evaluating - epidemiological models. +Description: This package introduces common data structures for working + with epidemiological data reported by location and time and offers + associated utilities to perform basic signal processing tasks. The + package is designed to be used in conjunction with `epipredict` for + building and evaluating epidemiological models. License: MIT + file LICENSE URL: https://cmu-delphi.github.io/epiprocess/ Depends: + epidatasets, R (>= 3.6) Imports: checkmate, cli, data.table, dplyr (>= 1.1.0), - epidatasets, - genlasso, ggplot2, glue, lifecycle (>= 1.0.1), lubridate, magrittr, + pkgconfig, purrr, rlang, slider, tibble, tidyr, tidyselect (>= 1.2.0), + tools, tsibble, utils, vctrs, waldo Suggests: devtools, + distributional, epidatr, + epipredict, here, knitr, outbreaks, readr, rmarkdown, testthat (>= 3.1.5), + trendfilter, withr VignetteBuilder: knitr @@ -76,7 +80,8 @@ Remotes: cmu-delphi/delphidocs, cmu-delphi/epidatasets, cmu-delphi/epidatr, - glmgen/genlasso, + cmu-delphi/epipredict, + glmgen/trendfilter, reconverse/outbreaks Config/Needs/website: cmu-delphi/delphidocs Config/testthat/edition: 3 @@ -103,5 +108,6 @@ Collate: 'reexports.R' 'revision_analysis.R' 'slide.R' + 'time-utils.R' 'utils.R' 'utils_pipe.R' diff --git a/NAMESPACE b/NAMESPACE index 1f5180fb7..2f97e5b3d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,6 +1,9 @@ # Generated by roxygen2: do not edit by hand +S3method("$<-",epi_df) S3method("[",epi_df) +S3method("[<-",epi_df) +S3method("[[<-",epi_df) S3method("names<-",epi_df) S3method(Summary,epi_df) S3method(arrange_canonical,default) @@ -39,31 +42,27 @@ S3method(guess_period,Date) S3method(guess_period,POSIXt) S3method(guess_period,default) S3method(key_colnames,data.frame) -S3method(key_colnames,default) S3method(key_colnames,epi_archive) S3method(key_colnames,epi_df) +S3method(key_colnames,tbl_ts) S3method(mean,epi_df) S3method(print,epi_archive) S3method(print,epi_df) S3method(print,grouped_epi_archive) +S3method(print,growth_rate_params) S3method(summary,epi_df) S3method(ungroup,epi_df) S3method(ungroup,grouped_epi_archive) S3method(unnest,epi_df) export("%>%") -export(archive_cases_dv_subset) export(arrange) export(arrange_canonical) export(as_epi_archive) export(as_epi_df) export(as_tsibble) export(autoplot) -export(cases_deaths_subset) export(clone) export(complete) -export(covid_case_death_rates_extended) -export(covid_incidence_county_subset) -export(covid_incidence_outliers) export(deprecated_quo_is_present) export(detect_outlr) export(detect_outlr_rm) @@ -85,6 +84,7 @@ export(group_by) export(group_epi_df) export(group_modify) export(growth_rate) +export(growth_rate_params) export(guess_period) export(is_epi_df) export(is_grouped_epi_archive) @@ -114,6 +114,7 @@ 
importFrom(checkmate,assert_function) importFrom(checkmate,assert_int) importFrom(checkmate,assert_list) importFrom(checkmate,assert_logical) +importFrom(checkmate,assert_number) importFrom(checkmate,assert_numeric) importFrom(checkmate,assert_scalar) importFrom(checkmate,assert_string) @@ -135,6 +136,8 @@ importFrom(cli,cli_li) importFrom(cli,cli_vec) importFrom(cli,cli_warn) importFrom(cli,format_message) +importFrom(cli,pluralize) +importFrom(cli,qty) importFrom(data.table,":=") importFrom(data.table,address) importFrom(data.table,as.data.table) @@ -143,6 +146,7 @@ importFrom(data.table,copy) importFrom(data.table,frollapply) importFrom(data.table,frollmean) importFrom(data.table,frollsum) +importFrom(data.table,is.data.table) importFrom(data.table,key) importFrom(data.table,rbindlist) importFrom(data.table,set) @@ -152,7 +156,6 @@ importFrom(dplyr,"%>%") importFrom(dplyr,across) importFrom(dplyr,all_of) importFrom(dplyr,arrange) -importFrom(dplyr,bind_rows) importFrom(dplyr,c_across) importFrom(dplyr,dplyr_col_modify) importFrom(dplyr,dplyr_reconstruct) @@ -164,6 +167,7 @@ importFrom(dplyr,group_by_drop_default) importFrom(dplyr,group_map) importFrom(dplyr,group_modify) importFrom(dplyr,group_vars) +importFrom(dplyr,grouped_df) importFrom(dplyr,groups) importFrom(dplyr,if_all) importFrom(dplyr,if_any) @@ -171,7 +175,6 @@ importFrom(dplyr,if_else) importFrom(dplyr,is_grouped_df) importFrom(dplyr,lag) importFrom(dplyr,mutate) -importFrom(dplyr,near) importFrom(dplyr,pick) importFrom(dplyr,pull) importFrom(dplyr,relocate) @@ -188,6 +191,7 @@ importFrom(lubridate,as.period) importFrom(lubridate,days) importFrom(lubridate,weeks) importFrom(magrittr,"%>%") +importFrom(purrr,list_rbind) importFrom(purrr,map) importFrom(purrr,map_lgl) importFrom(rlang,"!!!") @@ -199,6 +203,8 @@ importFrom(rlang,arg_match) importFrom(rlang,caller_arg) importFrom(rlang,caller_env) importFrom(rlang,check_dots_empty) +importFrom(rlang,check_dots_empty0) +importFrom(rlang,dots_n) importFrom(rlang,enquo) importFrom(rlang,enquos) importFrom(rlang,env) @@ -206,6 +212,7 @@ importFrom(rlang,expr_label) importFrom(rlang,f_env) importFrom(rlang,f_rhs) importFrom(rlang,is_bare_integerish) +importFrom(rlang,is_bare_numeric) importFrom(rlang,is_environment) importFrom(rlang,is_formula) importFrom(rlang,is_function) @@ -236,8 +243,20 @@ importFrom(tidyr,unnest) importFrom(tidyselect,any_of) importFrom(tidyselect,eval_select) importFrom(tidyselect,starts_with) +importFrom(tools,toTitleCase) importFrom(tsibble,as_tsibble) importFrom(utils,capture.output) importFrom(utils,tail) +importFrom(vctrs,"vec_slice<-") +importFrom(vctrs,vec_cast) importFrom(vctrs,vec_data) +importFrom(vctrs,vec_duplicate_any) importFrom(vctrs,vec_equal) +importFrom(vctrs,vec_in) +importFrom(vctrs,vec_order) +importFrom(vctrs,vec_rbind) +importFrom(vctrs,vec_recycle_common) +importFrom(vctrs,vec_rep) +importFrom(vctrs,vec_size) +importFrom(vctrs,vec_slice) +importFrom(vctrs,vec_sort) diff --git a/NEWS.md b/NEWS.md index ba6826da2..3ac814aa2 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,6 +2,73 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicate PR's. +# epiprocess 0.11 + +## Breaking changes + +- `growth_rate()` argument order and names have changed. You will need to + rewrite `growth_rate(x, y)` as `growth_rate(y, x)`. The interface for passing + arguments to the `"smooth_spline"` and `"trend_filter"` methods has also + changed. 
Finally, `growth_rate()` with `method = "trendfilter"` now uses the + `{trendfilter}` package rather than `{genlasso}`; results for this method will + be different than before. In order to make `{epiprocess}` installation easier + for users without a compiler, we have placed `{trendfilter}` in Suggests:; if + you want to use `method = "trendfilter"` you will need to manually install + this dependency (e.g., with `remotes::install_github("glmgen/trendfilter")`). +- In `revision_summary()`: + - The `should_compactify` argument is now called `compactify`. To migrate, + change any calls with `should_compactfiy =` to `compactify =`. + - Output now uses the name `lag_near_latest` instead of `time_near_latest`. To + migrate, update references to `time_near_latest` to `lag_near_latest`. + - `revision_summary(epi_arch)` without specifying the measurement column to + analyze in `...` will no longer attempt to guess which one you intended if + there are multiple possibilities to choose from (#571). If you attempt a + complicated tidyselection that selects zero columns, this is also now an + error. If you encounter such errors, manually specify the measurement column + in `...`. + - `min_waiting_period` now defines a nonstrict inequality instead of a strict + one. To obtain the old bounds, bump the `min_waiting_period` up to the next + possible value for your `time_type`. +- In `key_colnames()`: + - On regular (non-`epi_df`) data frames, now requires manual specification of + `geo_keys`, `other_keys`, and `time_keys`. + - The `extra_keys` argument has been deprecated and replaced with + `other_keys`. +- The compactification tolerance argument has been renamed to + `compactify_abs_tol` or `abs_tol`, depending on the function; now defines a + nonstrict tolerances; and defaults to 0 (requiring exact matches in order to + compactify). This argument has been added to `as_epi_archive()` and + `epix_merge()` and removed (along with all compactification options) from + `new_epi_archive()`. +- `validate_epi_archive()` now follows the validator convention of operating on + an "unvalidated" `epi_archive` (from `new_epi_archive`) rather than arguments. + +## Improvements +- `revision_summary()` now supports all `time_type`s. +- The compactification tolerance setting now works with integer-type columns. +- Various functions are now faster, using faster variants of core operations and + avoiding reconstructing grouped `epi_df`s when unnecessary. + +## Bug fixes + +- Fixed aggregation of age-group-specific rates to overall rates in `epi_df` vignette (#587). +- Fixed `key_colnames()` omitting some key columns on `epi_archive`s (#565). +- Fixed `epi_archive` compactification raising an error on certain value column + classes such as `"distribution"` (#541); it's now easier to form an archive of + forecasts in that format. +- Fixed large compactification tolerances potentially removing all versions of + some observations in certain cases when activity was flat. +- `[<-`, `[[<-`, and `$<-` now properly retain `epi_df`-ness when used on + grouped `epi_df`s. + +## Cleanup + +- Moved example datasets from being reexported in the package to being fetched + from `epidatasets`. The `epidatasets` package is now auto-loaded as a + dependency of `epiprocess`. The datasets can still be accessed, after loading + the package, with `data()` or the name of the dataset alone, or with + `epidatasets::` (#577). 
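A minimal migration sketch in R for the 0.11 breaking changes listed above; `edf`, `arch`, and the `cases` / `case_rate` columns are placeholder names used only for illustration, not objects shipped with the package:

```r
library(dplyr)
library(epiprocess)

# The trend filtering method now relies on the optional {trendfilter} package;
# install it manually if you plan to use method = "trend_filter":
# remotes::install_github("glmgen/trendfilter")

# growth_rate(): the signal now comes first, so growth_rate(x, y) becomes
# growth_rate(y, x); using named arguments keeps the call unambiguous.
edf <- edf %>%
  group_by(geo_value) %>%
  mutate(cases_gr = growth_rate(y = cases, x = time_value))

# revision_summary(): rename `should_compactify =` to `compactify =`, read
# `lag_near_latest` instead of `time_near_latest` in the output, and name the
# measurement column explicitly rather than relying on guessing, e.g.:
# revision_summary(arch, case_rate, compactify = TRUE)
```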
+ # epiprocess 0.10 ## Breaking changes diff --git a/R/archive.R b/R/archive.R index 3557c9914..922f77835 100644 --- a/R/archive.R +++ b/R/archive.R @@ -48,7 +48,7 @@ validate_version_bound <- function(version_bound, x, na_ok = FALSE, if (!identical(class(version_bound), class(x[["version"]]))) { cli_abort( "{version_bound_arg} must have the same `class` vector as x$version, - which has a `class` of {format_class_vec(class(x$version))}", + which has a `class` of {format_chr_deparse(class(x$version))}", class = "epiprocess__version_bound_mismatched_class" ) } @@ -128,8 +128,11 @@ next_after.Date <- function(x) x + 1L #' column tracks the time at which the data was available. This allows for #' version-aware forecasting. #' -#' `new_epi_archive` is the constructor for `epi_archive` objects that assumes -#' all arguments have been validated. Most users should use `as_epi_archive`. +#' `new_epi_archive` is the low-level constructor for `epi_archive` objects that +#' only performs some fast, basic checks on the inputs. `validate_epi_archive` +#' can perform more costly validation checks on its output. But most users +#' should use `as_epi_archive`, which performs all necessary checks and has some +#' additional features. #' #' @details An `epi_archive` contains a `data.table` object `DT` (from the #' `{data.table}` package), with (at least) the following columns: @@ -194,9 +197,15 @@ next_after.Date <- function(x) x + 1L #' that should be considered key variables (in the language of `data.table`) #' apart from "geo_value", "time_value", and "version". Typical examples #' are "age" or more granular geographies. -#' @param compactify Optional; Boolean. `TRUE` will remove some -#' redundant rows, `FALSE` will not, and missing or `NULL` will remove -#' redundant rows, but issue a warning. See more information at `compactify`. +#' @param compactify Optional; `TRUE`, `FALSE`, or `"message"`. `TRUE` will +#' remove some redundant rows, `FALSE` will not. `"message"` is like `TRUE` +#' but will emit a message if anything was changed. Default is `TRUE`. See +#' more information below under "Compactification:". +#' @param compactify_abs_tol Optional; double. A tolerance level used to detect +#' approximate equality for compactification. The default is 0, which +#' corresponds to exact equality. Consider using this if your value columns +#' undergo tiny nonmeaningful revisions and the archive object with the +#' default setting is too large. #' @param clobberable_versions_start Optional; `length`-1; either a value of the #' same `class` and `typeof` as `x$version`, or an `NA` of any `class` and #' `typeof`: specifically, either (a) the earliest version that could be @@ -221,8 +230,6 @@ next_after.Date <- function(x) x + 1L #' value of `clobberable_versions_start` does not fully trust these empty #' updates, and assumes that any version `>= max(x$version)` could be #' clobbered.) If `nrow(x) == 0`, then this argument is mandatory. -#' @param compactify_tol double. the tolerance used to detect approximate -#' equality for compactification #' @return An `epi_archive` object. 
#' #' @seealso [`epix_as_of`] [`epix_merge`] [`epix_slide`] @@ -275,19 +282,72 @@ new_epi_archive <- function( geo_type, time_type, other_keys, - compactify, clobberable_versions_start, - versions_end, - compactify_tol = .Machine$double.eps^0.5) { + versions_end) { + assert_data_frame(x) + assert_string(geo_type) + assert_string(time_type) + assert_character(other_keys, any.missing = FALSE) + if (any(c("geo_value", "time_value", "version") %in% other_keys)) { + cli_abort("`other_keys` cannot contain \"geo_value\", \"time_value\", or \"version\".") + } + validate_version_bound(clobberable_versions_start, x, na_ok = TRUE) + validate_version_bound(versions_end, x, na_ok = FALSE) + + key_vars <- c("geo_value", "time_value", other_keys, "version") + if (!all(key_vars %in% names(x))) { + # Give a more tailored error message than as.data.table would: + cli_abort(c( + "`x` is missing the following expected columns: + {format_varnames(setdiff(key_vars, names(x)))}.", + ">" = "You might need to `dplyr::rename()` beforehand + or use `as_epi_archive()`'s renaming feature.", + ">" = if (!all(other_keys %in% names(x))) { + "Check also for typos in `other_keys`." + } + )) + } + # Create the data table; if x was an un-keyed data.table itself, # then the call to as.data.table() will fail to set keys, so we # need to check this, then do it manually if needed - key_vars <- c("geo_value", "time_value", other_keys, "version") - data_table <- as.data.table(x, key = key_vars) # nolint: object_name_linter + data_table <- as.data.table(x, key = key_vars) if (!identical(key_vars, key(data_table))) setkeyv(data_table, cols = key_vars) - if (anyDuplicated(data_table, by = key(data_table)) != 0L) { - cli_abort("`x` must have one row per unique combination of the key variables. If you + structure( + list( + DT = data_table, + geo_type = geo_type, + time_type = time_type, + other_keys = other_keys, + clobberable_versions_start = clobberable_versions_start, + versions_end = versions_end + ), + class = "epi_archive" + ) +} + +#' Perform second (costly) round of validation that `x` is a proper `epi_archive` +#' +#' @rdname epi_archive +#' @export +validate_epi_archive <- function(x) { + assert_class(x, "epi_archive") + + ukey_vars1 <- c("geo_value", "time_value", x$other_keys, "version") + ukey_vars2 <- key(x$DT) + if (!identical(ukey_vars1, ukey_vars2)) { + cli_abort(c("`data.table::key(x$DT)` not as expected", + "*" = "Based on `x$other_keys` the key should be {format_chr_deparse(ukey_vars1)}", + "*" = "But `key(x$DT)` is {format_chr_deparse(ukey_vars2)}", + ">" = "Consider reconstructing the archive from `x$DT` specifying + the appropriate `other_keys`." + )) + } + # Rely on data.table to ensure that these key columns exist. + + if (anyDuplicated(x$DT, by = key(x$DT)) != 0L) { + cli_abort("`x$DT` must have one row per unique combination of the key variables. If you have additional key variables other than `geo_value`, `time_value`, and `version`, such as an age group column, please specify them in `other_keys`. 
Otherwise, check for duplicate rows and/or conflicting values for the same @@ -296,49 +356,37 @@ new_epi_archive <- function( ) } - nrow_before_compactify <- nrow(data_table) - # Runs compactify on data frame - if (is.null(compactify) || compactify == TRUE) { - compactified <- apply_compactify(data_table, key_vars, compactify_tol) - } else { - compactified <- data_table + if (!identical(class(x$DT$time_value), class(x$DT$version))) { + cli_abort( + "`x$DT$time_value` and `x$DT$version` must have the same class.", + class = "epiprocess__time_value_version_mismatch" + ) } - # Warns about redundant rows if the number of rows decreased, and we didn't - # explicitly say to compactify - if (is.null(compactify) && nrow(compactified) < nrow_before_compactify) { - elim <- removed_by_compactify(data_table, key_vars, compactify_tol) - warning_intro <- cli::format_inline( - "Found rows that appear redundant based on - last (version of each) observation carried forward; - these rows have been removed to 'compactify' and save space:", - keep_whitespace = FALSE + + if (anyMissing(x$DT$version)) { + cli_abort("Column `version` must not contain missing values.") + } + + if (nrow(x$DT) > 0L && x$versions_end < max(x$DT$version)) { + cli_abort( + "`x$versions_end` was {x$versions_end}, but `x$DT` contained + updates for a later version or versions, up through {max(x$DT$version)}", + class = "epiprocess__versions_end_earlier_than_updates" ) - warning_data <- paste(collapse = "\n", capture.output(print(elim, topn = 3L, nrows = 7L))) - warning_outro <- cli::format_inline( - "Built-in `epi_archive` functionality should be unaffected, - but results may change if you work directly with its fields (such as `DT`). - See `?as_epi_archive` for details. - To silence this warning but keep compactification, - you can pass `compactify=TRUE` when constructing the archive.", - keep_whitespace = FALSE + } + if (!is.na(x$clobberable_versions_start) && x$clobberable_versions_start > x$versions_end) { + cli_abort( + "`x$versions_end` was {x$versions_end}; however, `x$clobberable_versions_start` + was {x$clobberable_versions_start}, indicating that there were later observed versions", + class = "epiprocess__versions_end_earlier_than_clobberable_versions_start" ) - warning_message <- paste(sep = "\n", warning_intro, warning_data, warning_outro) - rlang::warn(warning_message, class = "epiprocess__compactify_default_removed_rows") } - structure( - list( - DT = compactified, - geo_type = geo_type, - time_type = time_type, - other_keys = other_keys, - clobberable_versions_start = clobberable_versions_start, - versions_end = versions_end - ), - class = "epi_archive" - ) + # Return x visibly due to popular `validate_...(new_...())` pattern: + x } + #' Given a tibble as would be found in an epi_archive, remove duplicate entries. #' #' Works by shifting all rows except the version, then comparing values to see @@ -346,108 +394,119 @@ new_epi_archive <- function( #' we don't need to group, since at least one column other than version has #' changed, and so is kept. #' +#' @param updates_df DT of an `epi_archive` or something analogous (though +#' potentially unsorted) of another class +#' @param ukey_names chr; the column names forming a unique key for the +#' `updates_df`; "version" must come last. For an `epi_archive`'s `DT`, this +#' would be `key(DT)`. 
+#' @param abs_tol numeric, >=0; absolute tolerance to use on numeric measurement +#' columns when determining whether something can be compactified away; see +#' [`is_locf`] +#' +#' @importFrom data.table is.data.table key +#' @importFrom dplyr arrange filter +#' @importFrom vctrs vec_duplicate_any +#' #' @keywords internal -#' @importFrom dplyr filter -apply_compactify <- function(df, keys, tolerance = .Machine$double.eps^.5) { - df %>% - arrange(!!!keys) %>% - filter(if_any( - c(everything(), -version), # all non-version columns - ~ !is_locf(., tolerance) - )) +apply_compactify <- function(updates_df, ukey_names, abs_tol = 0) { + assert_data_frame(updates_df) + assert_character(ukey_names) + assert_subset(ukey_names, names(updates_df)) + if (vec_duplicate_any(ukey_names)) { + cli_abort("`ukey_names` must not contain duplicates") + } + if (length(ukey_names) == 0 || ukey_names[[length(ukey_names)]] != "version") { + cli_abort('"version" must appear in `ukey_names` and must be last.') + } + assert_numeric(abs_tol, len = 1, lower = 0) + + if (!is.data.table(updates_df) || !identical(key(updates_df), ukey_names)) { + updates_df <- updates_df %>% arrange(pick(all_of(ukey_names))) + } + updates_df[!update_is_locf(updates_df, ukey_names, abs_tol), ] } #' get the entries that `compactify` would remove #' @keywords internal #' @importFrom dplyr filter if_all everything -removed_by_compactify <- function(df, keys, tolerance) { - df %>% - arrange(!!!keys) %>% - filter(if_all( - c(everything(), -version), - ~ is_locf(., tolerance) - )) # nolint: object_usage_linter +removed_by_compactify <- function(updates_df, ukey_names, abs_tol) { + if (!is.data.table(updates_df) || !identical(key(updates_df), ukey_names)) { + updates_df <- updates_df %>% arrange(pick(all_of(ukey_names))) + } + updates_df[update_is_locf(updates_df, ukey_names, abs_tol), ] } -#' Checks to see if a value in a vector is LOCF -#' @description -#' LOCF meaning last observation carried forward. lags the vector by 1, then -#' compares with itself. For doubles it uses float comparison via -#' [`dplyr::near`], otherwise it uses equality. `NA`'s and `NaN`'s are -#' considered equal to themselves and each other. -#' @importFrom dplyr lag if_else near +#' Internal helper; lgl; which updates are LOCF +#' +#' (Not validated:) Must be called inside certain dplyr data masking verbs (e.g., +#' `filter` or `mutate`) being run on an `epi_archive`'s `DT` or a data frame +#' formatted like one. +#' +#' @param arranged_updates_df an arranged update data frame like an `epi_archive` `DT` +#' @param ukey_names (not validated:) chr; the archive/equivalent +#' [`key_colnames`]; must include `"version"`. +#' @param abs_tol (not validated:) as in [`apply_compactify`] +#' +#' @return lgl +#' #' @keywords internal -is_locf <- function(vec, tolerance) { # nolint: object_usage_linter - lag_vec <- dplyr::lag(vec) - if (typeof(vec) == "double") { - res <- if_else( - !is.na(vec) & !is.na(lag_vec), - near(vec, lag_vec, tol = tolerance), - is.na(vec) & is.na(lag_vec) - ) - return(res) - } else { - res <- if_else( - !is.na(vec) & !is.na(lag_vec), - vec == lag_vec, - is.na(vec) & is.na(lag_vec) - ) - return(res) - } +update_is_locf <- function(arranged_updates_df, ukey_names, abs_tol) { + # Use as.list to get a shallow "copy" in case of data.table, so that column + # selection does not copy the column contents. Don't leak these column aliases + # or it will break data.table ownership model. 
+ updates_col_refs <- as.list(arranged_updates_df) + + all_names <- names(arranged_updates_df) + ekt_names <- ukey_names[ukey_names != "version"] + val_names <- all_names[!all_names %in% ukey_names] + + Reduce(`&`, lapply(updates_col_refs[ekt_names], is_locf, abs_tol, TRUE)) & + Reduce(`&`, lapply(updates_col_refs[val_names], is_locf, abs_tol, FALSE)) } - -#' `validate_epi_archive` ensures correctness of arguments fed to `as_epi_archive`. +#' Checks to see if a value in a vector is LOCF +#' @description LOCF meaning last observation carried forward (to later +#' versions). Lags the vector by 1, then compares with itself. If `is_key` is +#' `TRUE`, only values that are exactly the same between the lagged and +#' original are considered LOCF. If `is_key` is `FALSE` and `vec` is a vector +#' of numbers ([`base::is.numeric`]), then approximate equality will be used, +#' checking whether the absolute difference between each pair of entries is +#' `<= abs_tol`; if `vec` is something else, then exact equality is used +#' instead. #' -#' @rdname epi_archive +#' @details #' -#' @export -validate_epi_archive <- function( - x, - other_keys, - compactify, - clobberable_versions_start, - versions_end) { - # Finish off with small checks on keys variables and metadata - if (!test_subset(other_keys, names(x))) { - cli_abort("`other_keys` must be contained in the column names of `x`.") - } - if (any(c("geo_value", "time_value", "version") %in% other_keys)) { - cli_abort("`other_keys` cannot contain \"geo_value\", \"time_value\", or \"version\".") - } - - # Conduct checks and apply defaults for `compactify` - assert_logical(compactify, len = 1, any.missing = FALSE, null.ok = TRUE) - - # Make sure `time_value` and `version` have the same time type - if (!identical(class(x[["time_value"]]), class(x[["version"]]))) { - cli_abort( - "`time_value` and `version` must have the same class.", - class = "epiprocess__time_value_version_mismatch" - ) - } - - # Apply defaults and conduct checks for - # `clobberable_versions_start`, `versions_end`: - validate_version_bound(clobberable_versions_start, x, na_ok = TRUE) - validate_version_bound(versions_end, x, na_ok = FALSE) - if (nrow(x) > 0L && versions_end < max(x[["version"]])) { - cli_abort( - "`versions_end` was {versions_end}, but `x` contained - updates for a later version or versions, up through {max(x$version)}", - class = "epiprocess__versions_end_earlier_than_updates" - ) - } - if (!is.na(clobberable_versions_start) && clobberable_versions_start > versions_end) { - cli_abort( - "`versions_end` was {versions_end}, but a `clobberable_versions_start` - of {clobberable_versions_start} indicated that there were later observed versions", - class = "epiprocess__versions_end_earlier_than_clobberable_versions_start" - ) +#' We include epikey-time columns in LOCF comparisons as part of an optimization +#' to avoid slower grouped operations while still ensuring that the first +#' observation for each time series will not be marked as LOCF. We test these +#' key columns for exact equality to prevent chopping off consecutive +#' time_values during flat periods when `abs_tol` is high. +#' +#' We use exact equality for non-`is.numeric` double/integer columns such as +#' dates, datetimes, difftimes, `tsibble::yearmonth`s, etc., as these may be +#' used as part of re-indexing or grouping procedures, and we don't want to +#' change the number of groups for those operations when we remove LOCF data +#' during compactification. 
+#' +#' @importFrom dplyr lag if_else +#' @importFrom rlang is_bare_numeric +#' @importFrom vctrs vec_equal +#' @keywords internal +is_locf <- function(vec, abs_tol, is_key) { # nolint: object_usage_linter + lag_vec <- lag(vec) + if (is.vector(vec, mode = "numeric") && !is_key) { + # (integer or double vector, no class (& no dims); maybe names, which we'll + # ignore like `vec_equal`); not a key column + unname(if_else( + !is.na(vec) & !is.na(lag_vec), + abs(vec - lag_vec) <= abs_tol, + is.na(vec) & is.na(lag_vec) + )) + } else { + vec_equal(vec, lag_vec, na_equal = TRUE) } } - #' `as_epi_archive` converts a data frame, data table, or tibble into an #' `epi_archive` object. #' @@ -465,7 +524,8 @@ as_epi_archive <- function( geo_type = deprecated(), time_type = deprecated(), other_keys = character(), - compactify = NULL, + compactify = TRUE, + compactify_abs_tol = 0, clobberable_versions_start = NA, .versions_end = max_version_with_row_in(x), ..., versions_end = .versions_end) { @@ -474,17 +534,7 @@ as_epi_archive <- function( x <- guess_column_name(x, "time_value", time_column_names()) x <- guess_column_name(x, "geo_value", geo_column_names()) x <- guess_column_name(x, "version", version_column_names()) - if (!test_subset(c("geo_value", "time_value", "version"), names(x))) { - cli_abort( - "Either columns `geo_value`, `time_value`, and `version`, or related columns - (see the internal functions `guess_time_column_name()`, - `guess_geo_column_name()` and/or `guess_geo_version_name()` for complete - list) must be present in `x`." - ) - } - if (anyMissing(x$version)) { - cli_abort("Column `version` must not contain missing values.") - } + if (lifecycle::is_present(geo_type)) { cli_warn("epi_archive constructor argument `geo_type` is now ignored. Consider removing.") } @@ -495,13 +545,51 @@ as_epi_archive <- function( geo_type <- guess_geo_type(x$geo_value) time_type <- guess_time_type(x$time_value) - validate_epi_archive( - x, other_keys, compactify, clobberable_versions_start, versions_end - ) - new_epi_archive( + result <- validate_epi_archive(new_epi_archive( x, geo_type, time_type, other_keys, - compactify, clobberable_versions_start, versions_end - ) + clobberable_versions_start, versions_end + )) + + # Compactification: + if (!list(compactify) %in% list(TRUE, FALSE, "message")) { + cli_abort('`compactify` must be `TRUE`, `FALSE`, or `"message"`') + } + + data_table <- result$DT + key_vars <- key(data_table) + + nrow_before_compactify <- nrow(data_table) + # Runs compactify on data frame + if (identical(compactify, TRUE) || identical(compactify, "message")) { + compactified <- apply_compactify(data_table, key_vars, compactify_abs_tol) + } else { + compactified <- data_table + } + # Messages about redundant rows if the number of rows decreased, and we didn't + # explicitly say to compactify + if (identical(compactify, "message") && nrow(compactified) < nrow_before_compactify) { + elim <- removed_by_compactify(data_table, key_vars, compactify_abs_tol) + message_intro <- cli::format_inline( + "Found rows that appear redundant based on + last (version of each) observation carried forward; + these rows have been removed to 'compactify' and save space:", + keep_whitespace = FALSE + ) + message_data <- paste(collapse = "\n", capture.output(print(elim, topn = 3L, nrows = 7L))) + message_outro <- cli::format_inline( + "Built-in `epi_archive` functionality should be unaffected, + but results may change if you work directly with its fields (such as `DT`). + See `?as_epi_archive` for details. 
+ To silence this message but keep compactification, + you can pass `compactify=TRUE` when constructing the archive.", + keep_whitespace = FALSE + ) + message_string <- paste(sep = "\n", message_intro, message_data, message_outro) + rlang::inform(message_string, class = "epiprocess__compactify_default_removed_rows") + } + + result$DT <- compactified + result } @@ -718,5 +806,5 @@ clone <- function(x) { #' @export clone.epi_archive <- function(x) { x$DT <- data.table::copy(x$DT) - return(x) + x } diff --git a/R/correlation.R b/R/correlation.R index c66009737..8279820bc 100644 --- a/R/correlation.R +++ b/R/correlation.R @@ -120,8 +120,8 @@ epi_cor <- function(x, var1, var2, dt1 = 0, dt2 = 0, shift_by = geo_value, # nol # Function to perform time shifts, lag or lead shift <- function(var, n) { if (n < 0) { - return(dplyr::lag(var, -n)) + dplyr::lag(var, -n) } else { - return(dplyr::lead(var, n)) + dplyr::lead(var, n) } } diff --git a/R/epi_df.R b/R/epi_df.R index 6cae22dd2..4955ab08d 100644 --- a/R/epi_df.R +++ b/R/epi_df.R @@ -189,7 +189,10 @@ new_epi_df <- function(x = tibble::tibble(geo_value = character(), time_value = # Reorder columns (geo_value, time_value, ...) if (nrow(x) > 0) { - x <- x %>% relocate(all_of(c("geo_value", other_keys, "time_value")), .before = 1) + all_names <- names(x) + ukey_names <- c("geo_value", other_keys, "time_value") + value_names <- all_names[!all_names %in% ukey_names] + x <- x[c(ukey_names, value_names)] } # Apply epi_df class, attach metadata, and return @@ -218,7 +221,7 @@ as_epi_df <- function(x, ...) { #' @export as_epi_df.epi_df <- function(x, ...) { x <- ungroup(x) - return(x) + x } #' @rdname epi_df diff --git a/R/epiprocess-package.R b/R/epiprocess-package.R index 675d000db..61eccf993 100644 --- a/R/epiprocess-package.R +++ b/R/epiprocess-package.R @@ -14,24 +14,38 @@ #' @importFrom checkmate check_names #' @importFrom checkmate test_subset test_set_equal vname #' @importFrom cli cli_abort cli_warn +#' @importFrom cli pluralize +#' @importFrom cli qty #' @importFrom data.table as.data.table #' @importFrom data.table key #' @importFrom data.table setkeyv #' @importFrom dplyr arrange +#' @importFrom dplyr grouped_df #' @importFrom dplyr is_grouped_df #' @importFrom dplyr select #' @importFrom lifecycle deprecated +#' @importFrom purrr list_rbind #' @importFrom rlang %||% #' @importFrom rlang is_bare_integerish +#' @importFrom tools toTitleCase +#' @importFrom vctrs vec_cast #' @importFrom vctrs vec_data #' @importFrom vctrs vec_equal +#' @importFrom vctrs vec_in +#' @importFrom vctrs vec_order +#' @importFrom vctrs vec_rbind +#' @importFrom vctrs vec_recycle_common +#' @importFrom vctrs vec_rep +#' @importFrom vctrs vec_slice +#' @importFrom vctrs vec_slice<- +#' @importFrom vctrs vec_sort ## usethis namespace: end NULL utils::globalVariables(c( - ".x", ".group_key", ".ref_time_value", "resid", + ".", ".x", ".group_key", ".ref_time_value", "resid", "fitted", ".response", "geo_value", "time_value", "value", ".real", "lag", "max_value", "min_value", - "median_value", "spread", "rel_spread", "time_to", - "time_near_latest", "n_revisions", "min_lag", "max_lag" + "median_value", "spread", "rel_spread", "lag_to", + "lag_near_latest", "n_revisions", "min_lag", "max_lag" )) diff --git a/R/grouped_epi_archive.R b/R/grouped_epi_archive.R index 08eb2d250..378ea13bc 100644 --- a/R/grouped_epi_archive.R +++ b/R/grouped_epi_archive.R @@ -76,12 +76,12 @@ new_grouped_epi_archive <- function(x, vars, drop) { private$vars <- vars private$drop <- drop - 
return(structure( + structure( list( private = private ), class = "grouped_epi_archive" - )) + ) } @@ -332,7 +332,7 @@ epix_slide.grouped_epi_archive <- function( comp_value <- .slide_comp(.data_group, .group_key, .version, ...) # If this wasn't a tidyeval computation, we still need to check the output - # types. We'll let `group_modify` and `vec_rbind` deal with checking for + # types. We'll let `vec_rbind` and `bind_rows` deal with checking for # type compatibility between the outputs. if (!used_data_masking && !( # vctrs considers data.frames to be vectors, but we still check @@ -431,7 +431,7 @@ epix_slide.grouped_epi_archive <- function( } # Fast conversion: - return(validate_tibble(new_tibble(res))) + validate_tibble(new_tibble(res)) } out <- lapply(.versions, function(.version) { @@ -493,16 +493,14 @@ epix_slide.grouped_epi_archive <- function( } } - return( - dplyr::bind_rows(dplyr::group_map( # note: output will be ungrouped - dplyr::group_by(as_of_df, !!!syms(.x$private$vars), .drop = .x$private$drop), - group_map_fn, - .slide_comp = .slide_comp, ..., - .version = .version, - .new_col_name = .new_col_name, - .keep = TRUE - )) - ) + dplyr::bind_rows(dplyr::group_map( # note: output will be ungrouped + dplyr::group_by(as_of_df, !!!syms(.x$private$vars), .drop = .x$private$drop), + group_map_fn, + .slide_comp = .slide_comp, ..., + .version = .version, + .new_col_name = .new_col_name, + .keep = TRUE + )) }) # Combine output into a single tibble (allowing for packed columns) out <- vctrs::vec_rbind(!!!out) diff --git a/R/growth_rate.R b/R/growth_rate.R index 307309b5a..fc1cf6447 100644 --- a/R/growth_rate.R +++ b/R/growth_rate.R @@ -5,12 +5,12 @@ #' vignette](https://cmu-delphi.github.io/epiprocess/articles/growth_rate.html) #' for examples. #' +#' @param y Signal values. #' @param x Design points corresponding to the signal values `y`. Default is #' `seq_along(y)` (that is, equally-spaced points from 1 to the length of #' `y`). -#' @param y Signal values. #' @param x0 Points at which we should estimate the growth rate. Must be a -#' subset of `x` (no extrapolation allowed). Default is `x`. +#' contained in the range of `x` (no extrapolation allowed). Default is `x`. #' @param method Either "rel_change", "linear_reg", "smooth_spline", or #' "trend_filter", indicating the method to use for the growth rate #' calculation. The first two are local methods: they are run in a sliding @@ -21,14 +21,10 @@ #' "linear_reg". See details for more explanation. #' @param log_scale Should growth rates be estimated using the parametrization #' on the log scale? See details for an explanation. Default is `FALSE`. -#' @param dup_rm Should we check and remove duplicates in `x` (and corresponding -#' elements of `y`) before the computation? Some methods might handle -#' duplicate `x` values gracefully, whereas others might fail (either quietly -#' or loudly). Default is `FALSE`. #' @param na_rm Should missing values be removed before the computation? Default #' is `FALSE`. -#' @param ... Additional arguments to pass to the method used to estimate the -#' derivative. +#' @param params Additional arguments to pass to the method used to estimate the +#' derivative. This should be created with `growth_rate_params()`. #' @return Vector of growth rate estimates at the specified points `x0`. 
#' #' @details The growth rate of a function f defined over a continuously-valued @@ -49,12 +45,14 @@ #' sliding window centered at the reference point `x0`, divided by the fitted #' value from this linear regression at `x0`. #' * "smooth_spline": uses the estimated derivative at `x0` from a smoothing -#' spline fit to `x` and `y`, via `stats::smooth.spline()`, divided by the +#' spline fit to `x` and `y`, via [stats::smooth.spline()], divided by the #' fitted value of the spline at `x0`. #' * "trend_filter": uses the estimated derivative at `x0` from polynomial trend #' filtering (a discrete spline) fit to `x` and `y`, via -#' `genlasso::trendfilter()`, divided by the fitted value of the discrete -#' spline at `x0`. +#' [trendfilter::trendfilter()], divided by the fitted value of the discrete +#' spline at `x0`. This method requires the +#' [`{trendfilter}` package](https://github.com/glmgen/trendfilter) +#' to be installed. #' #' ## Log Scale #' @@ -80,27 +78,30 @@ #' ## Additional Arguments #' #' For the global methods, "smooth_spline" and "trend_filter", additional -#' arguments can be specified via `...` for the underlying estimation -#' function. For the smoothing spline case, these additional arguments are -#' passed directly to `stats::smooth.spline()` (and the defaults are exactly -#' as in this function). The trend filtering case works a bit differently: -#' here, a custom set of arguments is allowed (which are distributed -#' internally to `genlasso::trendfilter()` and `genlasso::cv.trendfilter()`): +#' arguments can be specified via `params` for the underlying estimation +#' function. These additional arguments are +#' passed to [stats::smooth.spline()], [trendfilter::trendfilter()], or +#' [trendfilter::cv_trendfilter()]. The defaults are exactly +#' as specified in those functions, except when those defaults conflict +#' among these functions. These cases are as follows: #' -#' * `ord`: order of piecewise polynomial for the trend filtering fit. Default -#' is 3. -#' * `maxsteps`: maximum number of steps to take in the solution path before -#' terminating. Default is 1000. -#' * `cv`: should cross-validation be used to choose an effective degrees of -#' freedom for the fit? Default is `TRUE`. -#' * `k`: number of folds if cross-validation is to be used. Default is 3. -#' * `df`: desired effective degrees of freedom for the trend filtering fit. If -#' `cv = FALSE`, then `df` must be a positive integer; if `cv = TRUE`, then -#' `df` must be one of "min" or "1se" indicating the selection rule to use +#' * `df`: desired effective degrees of freedom. For "smooth_spline", this must be numeric (or `NULL`) and will +#' be passed along to the underlying function. For "trend_filter", if +#' `cv = FALSE`, then `df` must be a positive number (integer is most sensible); +#' if `cv = TRUE`, then `df` must be one of "min" or "1se" indicating the +#' selection rule to use #' based on the cross-validation error curve: minimum or 1-standard-error -#' rule, respectively. Default is "min" (going along with the default `cv = -#' TRUE`). Note that if `cv = FALSE`, then we require `df` to be set by the -#' user. +#' rule, respectively. The default is "min" (going along with the default +#' `cv = TRUE`). +#' * `lambda`: For "smooth_spline", this should be a scalar value or `NULL`. +#' For "trend_filter", this is allowed to also be a vector, as long as either +#' `cv = TRUE` or `df` is specified. +#' * `cv`: should cross-validation be used to choose an effective degrees of +#' freedom for the fit? 
The default is `FALSE` to match [stats::smooth.spline()]. +#' In that case, as in that function, GCV is used instead. +#' For "trend_filter", this will be coerced to `TRUE` if neither +#' `df` nor `lambda` are specified (the default). +#' Note that passing both `df` and a scalar `lambda` will always be an error. #' #' @export #' @examples @@ -109,54 +110,67 @@ #' group_by(geo_value) %>% #' mutate(cases_gr = growth_rate(x = time_value, y = cases)) #' -#' # Log scale, degree 4 polynomial and 6-fold cross validation +#' # Degree 3 polynomial and 5-fold cross validation on the log scale +#' # some locations report 0 cases, so we replace these with 1 #' cases_deaths_subset %>% #' group_by(geo_value) %>% -#' mutate(gr_poly = growth_rate(x = time_value, y = cases, log_scale = TRUE, ord = 4, k = 6)) -growth_rate <- function(x = seq_along(y), y, x0 = x, - method = c( - "rel_change", "linear_reg", - "smooth_spline", "trend_filter" - ), - h = 7, log_scale = FALSE, - dup_rm = FALSE, na_rm = FALSE, ...) { +#' mutate(gr_poly = growth_rate( +#' x = time_value, y = pmax(cases, 1), method = "trend_filter", +#' log_scale = TRUE, na_rm = TRUE +#' )) +growth_rate <- function( + y, x = seq_along(y), x0 = x, + method = c("rel_change", "linear_reg", "smooth_spline", "trend_filter"), + h = 7, log_scale = FALSE, na_rm = FALSE, + params = growth_rate_params()) { # Check x, y, x0 if (length(x) != length(y)) cli_abort("`x` and `y` must have the same length.") - if (!all(x0 %in% x)) cli_abort("`x0` must be a subset of `x`.") method <- rlang::arg_match(method) + assert_class(params, "growth_rate_params") + if (anyNA(x) || anyNA(x0)) { + cli_abort("Neither `x` nor `x0` may contain `NA`s.") + } + if (vctrs::vec_duplicate_any(x)) { + cli_abort( + "`x` contains duplicate values. (If being run on a + column in an `epi_df`, did you group by relevant key variables?)" + ) + } + if (method == "trend_filter" && !requireNamespace("trendfilter", quietly = TRUE)) { + cli_abort(c( + "The {.pkg trendfilter} package must be installed to use this option.", + i = "It is available at {.url https://github.com/glmgen/trendfilter}." + )) + } # Arrange in increasing order of x + if (min(x0) < min(x) || max(x0) > max(x)) { + cli_abort("`x0` must be contained in `[min(x), max(x)]`.") + } o <- order(x) x <- x[o] y <- y[o] + n <- length(y) # Convert to log(y) if we need to y <- as.numeric(y) - if (log_scale) y <- log(y) - - # Remove duplicates if we need to - if (dup_rm) { - o <- !duplicated(x) - if (any(!o)) { - cli_warn( - "`x` contains duplicate values. (If being run on a - column in an `epi_df`, did you group by relevant key variables?)" - ) + if (log_scale) { + if (any(y <= 0)) { + cli_warn("`y` contains 0 or negative values. Taking logs may produce + strange results.") } - x <- x[o] - y <- y[o] + y <- suppressWarnings(log(y)) } - - # Remove NAs if we need to - if (na_rm) { - o <- !(is.na(x) & is.na(y)) - x <- x[o] - y <- y[o] + if (!is.finite(y[1]) || !is.finite(y[n])) { + cli_abort("Either the first or last `y` values are not finite. 
This may be + due to `log_scale = TRUE`.") } - - # Useful indices for later - i0 <- x %in% x0 + good_obs <- (!is.na(y) | !na_rm) & is.finite(y) + x <- x[good_obs] + y <- y[good_obs] + x <- as.numeric(x) + x0 <- as.numeric(x0) # Local methods if (method == "rel_change" || method == "linear_reg") { @@ -178,9 +192,9 @@ growth_rate <- function(x = seq_along(y), y, x0 = x, a <- mean(yy[left]) hh <- mean(xx[right]) - mean(xx[left]) if (log_scale) { - return((b - a) / hh) + (b - a) / hh } else { - return((b / a - 1) / hh) + (b / a - 1) / hh } } else { # Linear regression @@ -189,30 +203,33 @@ growth_rate <- function(x = seq_along(y), y, x0 = x, b <- sum(xm * ym) / sum(xm^2) a <- mean(yy - b * xx) if (log_scale) { - return(b) + b } else { - return(b / (a + b * x_ref)) + b / (a + b * x_ref) } } }) - - return(g[i0]) + return(stats::approx(x, g, x0)$y) } # Global methods if (method == "smooth_spline" || method == "trend_filter") { - # Convert to numerics - x <- as.numeric(x) - x0 <- as.numeric(x0) - - # Collect parameters - params <- list(...) + if (any(is.na(x) | is.na(y) | !is.finite(x) | !is.finite(y))) { + cli_abort(c( + "{.val {method}} requires all real values without missingness.", + i = "Set `na_rm = TRUE` and / or check for infinite values.", + i = "Using `log_scale = TRUE` may induce either case." + )) + } - # Smoothing spline if (method == "smooth_spline") { - params$x <- x - params$y <- y - obj <- do.call(stats::smooth.spline, params) + if (is.character(params$df)) params$df <- NULL + if (length(params$lambda) > 1L) { + cli_abort("{.val smooth_spline} requires 1 `lambda` but more were used.") + } + params <- params[c("df", "spar", "lambda", "cv", "all.knots", "df.offset", "penalty")] + params <- params[!sapply(params, is.null)] + obj <- rlang::inject(stats::smooth.spline(x = x, y = y, !!!params)) f0 <- stats::predict(obj, x = x0)$y d0 <- stats::predict(obj, x = x0, deriv = 1)$y if (log_scale) { @@ -220,46 +237,149 @@ growth_rate <- function(x = seq_along(y), y, x0 = x, } else { return(d0 / f0) } - } else { - # Trend filtering - ord <- params$ord - maxsteps <- params$maxsteps - cv <- params$cv - df <- params$df - k <- params$k - - # Default parameters - ord <- ord %||% 3 - maxsteps <- maxsteps %||% 1000 - cv <- cv %||% TRUE - df <- df %||% "min" - k <- k %||% 3 - - # Check cv and df combo - if (is.numeric(df)) cv <- FALSE - if (!cv && !(is.numeric(df) && df == round(df))) { - cli_abort("If `cv = FALSE`, then `df` must be an integer.") - } - - # Compute trend filtering path - obj <- genlasso::trendfilter(y = y, pos = x, ord = ord, max = maxsteps) - - # Use CV to find df, if we need to - if (cv) { - cv_obj <- quiet(genlasso::cv.trendfilter(obj, k = k, mode = "df")) - df <- ifelse(df == "min", cv_obj$df.min, cv_obj$df.1se) + } else { # Trend filtering + params <- parse_trendfilter_params(params) + if (params$cv) { + obj <- trendfilter::cv_trendfilter( + y, x, + k = params$k, error_measure = params$error_measure, + nfolds = params$nfolds, family = params$family, lambda = params$lambda, + nlambda = params$nlambda, lambda_max = params$lambda_max, + lambda_min = params$lambda_min, lambda_min_ratio = params$lambda_min_ratio + ) + lam <- params$df + which_lambda <- paste0("lambda_", lam) + f <- stats::predict(obj, newx = x0, which_lambda = which_lambda) + } else { + obj <- trendfilter::trendfilter( + y, x, + k = params$k, family = params$family, lambda = params$lambda, + nlambda = params$nlambda, lambda_max = params$lambda_max, + lambda_min = params$lambda_min, lambda_min_ratio = 
params$lambda_min_ratio + ) + single_lambda <- length(obj$lambda) == 1L + lam <- ifelse(single_lambda, obj$lambda, obj$lambda[which.min(abs(params$df - obj$dof))]) + f <- stats::predict(obj, newx = x0, lambda = lam) } - # Estimate growth rate and return - f <- genlasso::coef.genlasso(obj, df = df)$beta - d <- diff(f) / diff(x) + d <- diff(f) / diff(x0) # Extend by one element d <- c(d, d[length(d)]) if (log_scale) { - return(d[i0]) + return(d) } else { - return((d / f)[i0]) + return(d / f) + } + } + } +} + +#' Optional parameters for growth rate methods +#' +#' Construct an object containing non-standard arguments for [growth_rate()]. +#' +#' @param df Numeric or NULL for "smooth_spline". May also be one of "min" or +#' "max" in the case of "trend_filter". The desired equivalent number of +#' degrees of freedom of the fit. Lower values give smoother estimates. +#' @param lambda The desired smoothing parameter. For "smooth_spline", this +#' can be specified instead of `spar`. For "trend_filter", this sequence +#' determines the balance between data fidelity and smoothness of the +#' estimated curve; larger `lambda` results in a smoother estimate. The +#' default, `NULL` results in an automatic computation based on `nlambda`, +#' the largest value of `lambda` that would result in a maximally smooth +#' estimate, and `lambda_min_ratio`. Supplying a value of `lambda` overrides +#' this behaviour. +#' @param cv For "smooth_spline", ordinary leave-one-out (`TRUE`) or ‘generalized’ +#' cross-validation (GCV) when `FALSE`; is used for smoothing parameter computation +#' only when both `spar` and `df` are not specified. For "trend_filter", +#' `cv` determines whether or not cross-validation is used to choose the +#' tuning parameter. If `FALSE`, then the user must specify either `lambda` +#' or `df`. +#' @inheritParams stats::smooth.spline +#' @inheritParams trendfilter::trendfilter +#' @inheritParams trendfilter::cv_trendfilter +#' +#' @return A list of parameter configurations. 
+#' @importFrom checkmate assert_number +#' @export +growth_rate_params <- function( + df = NULL, + lambda = NULL, + cv = FALSE, + spar = NULL, + all.knots = FALSE, # nolint + df.offset = 0, # nolint + penalty = 1, + k = 3L, + family = c("gaussian", "logistic", "poisson"), + nlambda = 50L, + lambda_max = NULL, + lambda_min = NULL, + lambda_min_ratio = 1e-5, + error_measure = c("deviance", "mse", "mae"), + nfolds = 3L) { + if (is.character(df)) { + df <- rlang::arg_match0(df, c("min", "1se")) + } else { + assert_number(df, lower = 0, null.ok = TRUE, finite = TRUE) + } + assert_number(spar, null.ok = TRUE, finite = TRUE) + assert_numeric(lambda, lower = 0, null.ok = TRUE, finite = TRUE) + assert_logical(cv, len = 1) + assert_logical(all.knots, len = 1) + assert_number(df.offset, lower = 0, finite = TRUE) + assert_number(penalty, lower = 0, finite = TRUE) + checkmate::assert_integerish(k, lower = 0, len = 1) + family <- arg_match(family) + assert_number(nlambda, lower = 0, finite = TRUE) + assert_number(lambda_max, lower = 0, finite = TRUE, null.ok = TRUE) + assert_number(lambda_min, lower = 0, finite = TRUE, null.ok = TRUE) + assert_number(lambda_min_ratio, lower = 0, upper = 1) + error_measure <- arg_match(error_measure) + checkmate::assert_integerish(nfolds, lower = 2, len = 1) + + structure(enlist( + df, lambda, cv, # shared by all + spar, all.knots, df.offset, penalty, # smooth.spline + k, family, nlambda, lambda_max, lambda_min, lambda_min_ratio, # all TF + error_measure, nfolds # cv_trendfilter + ), class = "growth_rate_params") +} + +#' @export +print.growth_rate_params <- function(x, ...) { + utils::str(x, give.attr = FALSE) +} + +parse_trendfilter_params <- function(params) { + assert_class(params, "growth_rate_params") + vec_lambda <- checkmate::test_numeric(params$lambda, min.len = 2L, null.ok = TRUE) + df_cv <- checkmate::test_character(params$df, null.ok = TRUE) + if (df_cv && vec_lambda) { + params$cv <- TRUE # Turn CV on (or leave it on) + params$df <- params$df %||% "min" # use the original arg or provide the minimizer + return(params) + } + if (params$cv) { # CV = TRUE on input but conflicts with other custom args + cli_abort( + "When `cv = TRUE`, `df` must be `NULL` or character and `lambda` must be + `NULL` or a vector." + ) + } else { # CV should stay FALSE + if (!vec_lambda) { + if (is.character(params$df)) { + cli_abort( + "`df` a character implies using CV, but also setting `lambda` to a + single value implies no CV." + ) + } + if (is.numeric(params$df)) { + cli_abort("`df` and `lambda` cannot both be scalars.") } } } + # If we got here, we fit TF. There are two possibilities: + # 1. df is NULL and lambda is a scalar + # 2. df is numeric and lambda is either NULL or a vector (vec_lambda = TRUE) + params } diff --git a/R/key_colnames.R b/R/key_colnames.R index 49c326748..eeecce05b 100644 --- a/R/key_colnames.R +++ b/R/key_colnames.R @@ -1,47 +1,133 @@ -#' Grab any keys associated to an epi_df +#' Get names of columns that form a (unique) key associated with an object #' -#' @param x a data.frame, tibble, or epi_df +#' This is entirely based on metadata and arguments passed; there are no +#' explicit checks that the key actually is unique in any associated data +#' structures. +#' +#' @param x an object, often a data frame or something similar. `{epiprocess}` +#' includes implementations for [`epi_df`]s, [`epi_archive`]s, +#' [`tsibble::tsibble`]s, and other data frames (including +#' [`tibble::tibble`]s); other packages, like `{epipredict}`, can add more. #' @param ... 
additional arguments passed on to methods -#' @param other_keys an optional character vector of other keys to include -#' @param exclude an optional character vector of keys to exclude -#' @return If an `epi_df`, this returns all "keys". Otherwise `NULL`. +#' @param geo_keys,other_keys,time_keys character vectors, sometimes optional; +#' which variables (if any) should be considered as part of a unique +#' key/identifier for data in `x`, dealing respectively with the associated +#' geographical region, demographic/strain/other information needed in +#' addition to the geographical region to identify individual time series in +#' `x`, and time interval during which associated events occurred. +#' +#' Mandatory if `x` is a regular `data.frame` or `tibble`. Optional if `x` is +#' an `epi_df`; the defaults are `"geo_value"`, the `epi_df`'s `other_keys` +#' metadata, and `"time_value"`, respectively; if you provide these manually, +#' they must match the defaults. (This behavior is to enable consistent and +#' sane results when you can't guarantee whether `x` is an `epi_df` or just a +#' `tibble`/`data.frame`. You don't need to use it if you know that `x` is +#' definitely an `epi_df`.) Not accepted when `x` is a `tsibble` or an +#' `epi_archive`. +#' @param exclude an optional character vector of key column names to exclude +#' from the result +#' @return character vector #' @keywords internal #' @export -key_colnames <- function(x, ...) { - UseMethod("key_colnames") -} - -#' @rdname key_colnames -#' @method key_colnames default -#' @export -key_colnames.default <- function(x, ...) { - character(0L) +key_colnames <- function(x, ..., exclude = character()) { + provided_args <- rlang::call_args_names(rlang::call_match()) + if ("extra_keys" %in% provided_args) { + lifecycle::deprecate_soft("0.9.6", "key_colnames(extra_keys=)", "key_colnames(other_keys=)") + redispatch <- function(..., extra_keys) { + key_colnames(..., other_keys = extra_keys) + } + redispatch(x, ..., exclude = exclude) + } else { + UseMethod("key_colnames") + } } #' @rdname key_colnames +#' @importFrom rlang check_dots_empty0 #' @method key_colnames data.frame #' @export -key_colnames.data.frame <- function(x, other_keys = character(0L), exclude = character(0L), ...) { +key_colnames.data.frame <- function(x, ..., + geo_keys, + other_keys, + time_keys, + exclude = character()) { + check_dots_empty0(...) + assert_character(geo_keys) + assert_character(time_keys) assert_character(other_keys) assert_character(exclude) - nm <- setdiff(c("geo_value", other_keys, "time_value"), exclude) - intersect(nm, colnames(x)) + keys <- c(geo_keys, other_keys, time_keys) + if (!all(keys %in% names(x))) { + cli_abort(c( + "Some of the specified key columns aren't present in `x`", + "i" = "Specified keys: {format_varnames(keys)}", + "i" = "Columns of x: {format_varnames(names(x))}", + "x" = "Missing keys: {format_varnames(setdiff(keys, names(x)))}" + ), class = "epiprocess__key_colnames__keys_not_in_colnames") + } + setdiff(keys, exclude) } #' @rdname key_colnames #' @method key_colnames epi_df #' @export -key_colnames.epi_df <- function(x, exclude = character(0L), ...) { +key_colnames.epi_df <- function(x, ..., + geo_keys = "geo_value", + other_keys = attr(x, "metadata")$other_keys, + time_keys = "time_value", + exclude = character()) { + check_dots_empty0(...) 
+ if (!identical(geo_keys, "geo_value")) { + cli_abort('If `x` is an `epi_df`, then `geo_keys` must be `"geo_value"`', + class = "epiprocess__key_colnames__mismatched_geo_keys" + ) + } + if (!identical(time_keys, "time_value")) { + cli_abort('If `x` is an `epi_df`, then `time_keys` must be `"time_value"`', + class = "epiprocess__key_colnames__mismatched_time_keys" + ) + } + expected_other_keys <- attr(x, "metadata")$other_keys + if (!identical(other_keys, expected_other_keys)) { + cli_abort(c( + "The provided `other_keys` argument didn't match the `other_keys` of `x`", + "*" = "`other_keys` was {format_chr_with_quotes(other_keys)}", + "*" = "`expected_other_keys` was {format_chr_with_quotes(expected_other_keys)}", + "i" = "If you know that `x` will always be an `epi_df` and + resolve this discrepancy by adjusting the metadata of `x`, you + shouldn't have to pass `other_keys =` here anymore, + unless you want to continue to perform this check." + ), class = "epiprocess__key_colnames__mismatched_other_keys") + } assert_character(exclude) - other_keys <- attr(x, "metadata")$other_keys setdiff(c("geo_value", other_keys, "time_value"), exclude) } +#' @rdname key_colnames +#' @method key_colnames tbl_ts +#' @export +key_colnames.tbl_ts <- function(x, ..., exclude = character()) { + check_dots_empty0(...) + assert_character(exclude) + idx <- tsibble::index_var(x) + idx2 <- tsibble::index2_var(x) + if (!identical(idx, idx2)) { + cli_abort(c( + "`x` is in the middle of a re-indexing operation with `index_by()`; it's unclear + whether we should output the old unique key or the new unique key-to-be", + "i" = "Old index: {format_varname(idx)}", + "i" = "Pending new index: {format_varname(idx2)}", + "Please complete (e.g., with `summarise()`) or remove the re-indexing operation." + ), class = "epiprocess__key_colnames__incomplete_reindexing_operation") + } + setdiff(c(tsibble::key_vars(x), idx), exclude) +} + #' @rdname key_colnames #' @method key_colnames epi_archive #' @export -key_colnames.epi_archive <- function(x, exclude = character(0L), ...) { +key_colnames.epi_archive <- function(x, ..., exclude = character()) { + check_dots_empty0(...) assert_character(exclude) - other_keys <- attr(x, "metadata")$other_keys - setdiff(c("geo_value", other_keys, "time_value"), exclude) + setdiff(c("geo_value", x$other_keys, "time_value", "version"), exclude) } diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R index 9ad456735..362fd4eaa 100644 --- a/R/methods-epi_archive.R +++ b/R/methods-epi_archive.R @@ -52,8 +52,7 @@ #' # (a.k.a. "hotfixed", "clobbered", etc.): #' clobberable_versions_start = max(archive_cases_dv_subset$DT$version), #' # Suppose today is the following day, and there are no updates out yet: -#' versions_end = max(archive_cases_dv_subset$DT$version) + 1L, -#' compactify = TRUE +#' versions_end = max(archive_cases_dv_subset$DT$version) + 1L #' ) #' #' epix_as_of(archive_cases_dv_subset2, max(archive_cases_dv_subset$DT$version)) @@ -263,8 +262,9 @@ epix_fill_through_version <- function(x, fill_versions_end, how = c("na", "locf" #' and use `min(x$versions_end, y$versions_end)` as the result's #' `versions_end`. #' -#' @param compactify Optional; `TRUE` (default), `FALSE`, or `NULL`; should the +#' @param compactify Optional; `TRUE` (default), `FALSE`, or `"message"`; should the #' result be compactified? See `as_epi_archive()` for details. +#' @param compactify_abs_tol As in [`as_epi_archive()`]. 
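For orientation, a minimal sketch of how the reworked `key_colnames()` methods above behave; this is illustrative only, and it assumes `as_epi_df()` accepts an `other_keys =` argument as in recent epiprocess releases:

library(epiprocess)

tbl <- tibble::tibble(
  geo_value = "ca",
  age_group = "0-17",
  time_value = as.Date("2020-03-01") + 0:2,
  cases = 1:3
)

# Plain data frames carry no key metadata, so all three key sets are required:
key_colnames(tbl,
  geo_keys = "geo_value", other_keys = "age_group", time_keys = "time_value"
)
# expected: "geo_value" "age_group" "time_value"

# For an epi_df the defaults come from its metadata, so no arguments are needed:
edf <- as_epi_df(tbl, other_keys = "age_group", as_of = as.Date("2020-03-10"))
key_colnames(edf)
key_colnames(edf, exclude = "time_value") # drop the time key from the result

# For an epi_archive, "version" is now part of the reported key.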
#' @details #' When merging archives, unless the archives have identical data release #' patterns, we often have to handle the situation when one signal has a more @@ -344,7 +344,7 @@ epix_fill_through_version <- function(x, fill_versions_end, how = c("na", "locf" #' @export epix_merge <- function(x, y, sync = c("forbid", "na", "locf", "truncate"), - compactify = TRUE) { + compactify = TRUE, compactify_abs_tol = 0) { assert_class(x, "epi_archive") assert_class(y, "epi_archive") sync <- rlang::arg_match(sync) @@ -527,7 +527,7 @@ epix_merge <- function(x, y, # inputs are already compactified, but at time of writing we don't have # compactify in its own method or field, and it seems like it should be # pretty fast anyway. - compactify = compactify, + compactify = compactify, compactify_abs_tol = compactify_abs_tol, clobberable_versions_start = result_clobberable_versions_start, versions_end = new_versions_end )) @@ -604,10 +604,10 @@ epix_detailed_restricted_mutate <- function(.data, ...) { out_archive <- .data out_archive$DT <- out_dt request_names <- names(col_modify_cols) - return(list( + list( archive = out_archive, request_names = request_names - )) + ) # (We might also consider special-casing when `mutate` hands back something # equivalent (in some sense) to the input (probably only encountered when # we're dealing with `group_by`), and using just `$DT`, not a shallow copy, @@ -879,8 +879,7 @@ epix_slide.epi_archive <- function( #' @noRd epix_slide_versions_default <- function(ea) { versions_with_updates <- c(ea$DT$version, ea$versions_end) - ref_time_values <- tidyr::full_seq(versions_with_updates, guess_period(versions_with_updates)) - return(ref_time_values) + tidyr::full_seq(versions_with_updates, guess_period(versions_with_updates)) } @@ -920,7 +919,7 @@ epix_truncate_versions_after.epi_archive <- function(x, max_version) { x$clobberable_versions_start <- NA } x$versions_end <- max_version - return(x) + x } diff --git a/R/methods-epi_df.R b/R/methods-epi_df.R index 6e19f7531..1191521c9 100644 --- a/R/methods-epi_df.R +++ b/R/methods-epi_df.R @@ -1,27 +1,38 @@ #' Convert to tibble #' -#' Converts an `epi_df` object into a tibble, dropping metadata and any -#' grouping. +#' Converts an `epi_df` object into a tibble, dropping metadata, any +#' grouping, and any unrelated classes and attributes. #' #' Advanced: if you are working with a third-party package that uses #' `as_tibble()` on `epi_df`s but you actually want them to remain `epi_df`s, #' use `attr(your_epi_df, "decay_to_tibble") <- FALSE` beforehand. #' #' @param x an `epi_df` -#' @inheritParams tibble::as_tibble -#' @importFrom tibble as_tibble +#' @param ... if present, forwarded to [`tibble::as_tibble`] +#' @importFrom tibble as_tibble new_tibble +#' @importFrom rlang dots_n +#' @importFrom vctrs vec_data vec_size #' @export as_tibble.epi_df <- function(x, ...) { # Note that some versions of `tsibble` overwrite `as_tibble.grouped_df`, which - # also impacts grouped `epi_df`s don't rely on `NextMethod()`. Destructure - # first instead. - destructured <- tibble::as_tibble(vctrs::vec_data(x), ...) + # also impacts grouped `epi_df`s, so don't rely on `NextMethod()`. Destructure + # and redispatch instead. + destructured <- vec_data(x) # -> data.frame, dropping extra attrs + tbl <- if (dots_n(...) 
== 0 && + is.null(pkgconfig::get_config("tibble::rownames"))) { # nolint: indentation_linter + # perf: new_tibble instead of as_tibble.data.frame which performs + # extra checks whose defaults should be redundant here: + new_tibble(destructured) + # (^ We don't need to provide nrow= as we have >0 columns.) + } else { + as_tibble(destructured, ...) + } if (attr(x, "decay_to_tibble") %||% TRUE) { - return(destructured) + tbl } else { # We specially requested via attr not to decay epi_df-ness but to drop any - # grouping. - reclass(destructured, attr(x, "metadata")) + # grouping. (Miscellaneous attrs are also dropped.) + reclass(tbl, attr(x, "metadata")) } } @@ -39,12 +50,10 @@ as_tibble.epi_df <- function(x, ...) { #' @export as_tsibble.epi_df <- function(x, key, ...) { if (missing(key)) key <- c("geo_value", attributes(x)$metadata$other_keys) - return( - as_tsibble( - tibble::as_tibble(x), - key = tidyselect::all_of(key), index = "time_value", - ... - ) + as_tsibble( + tibble::as_tibble(x), + key = tidyselect::all_of(key), index = "time_value", + ... ) } @@ -153,7 +162,30 @@ dplyr_reconstruct.epi_df <- function(data, template) { # keep any grouping that has been applied: res <- NextMethod() - col_names <- names(res) + reconstruct_light_edf(res, template) +} + +#' Like `dplyr_reconstruct.epi_df` but not recomputing any grouping +#' +#' In the move to our current not-quite-proper/effective "implementation" of +#' [`dplyr::dplyr_extending`] for `epi_df`s, we moved a lot of checks in +#' `dplyr_reconstruct` and used it instead of `reclass()` in various +#' operations to prevent operations from outputting invalid metadata/classes, +#' instead of more careful tailored and relevant checks. However, this actually +#' introduced extra overhead due to `dplyr_reconstruct.epi_df()` passing off to +#' `dplyr_reconstruct.grouped_df()` when grouped, which assumes that it will +#' need to / should for safety recompute the groups, even when it'd be safe for +#' it not to do so. In many operations, we're using `NextMethod()` to dispatch +#' to `grouped_df` behavior if needed, and it should output something with valid +#' groupings. +#' +#' This function serves the original purpose of performing `epi_df`-centric +#' checks rather than just throwing on potentially-incorrect metadata like +#' `reclass()`, but without unnecessary `dplyr_reconstruct()` delegation. +#' +#' @keywords internal +reconstruct_light_edf <- function(data, template) { + col_names <- names(data) # Duplicate columns, cli_abort dup_col_names <- col_names[duplicated(col_names)] @@ -171,13 +203,13 @@ dplyr_reconstruct.epi_df <- function(data, template) { if (not_epi_df) { # If we're calling on an `epi_df` from one of our own functions, we need to # decay to a non-`epi_df` result. If `dplyr` is calling, `x` is a tibble, - # `res` is not an `epi_df` yet (but might, e.g., be a `grouped_df`), and we + # `data` is not an `epi_df` yet (but might, e.g., be a `grouped_df`), and we # simply need to skip adding the metadata & class. Current `decay_epi_df` # should work in both cases. - return(decay_epi_df(res)) + return(decay_epi_df(data)) } - res <- reclass(res, attr(template, "metadata")) + data <- reclass(data, attr(template, "metadata")) # XXX we may want verify the `geo_type` and `time_type` here. 
If it's # significant overhead, we may also want to keep this less strict version @@ -185,9 +217,9 @@ dplyr_reconstruct.epi_df <- function(data, template) { # Amend additional metadata if some other_keys cols are dropped in the subset old_other_keys <- attr(template, "metadata")$other_keys - attr(res, "metadata")$other_keys <- old_other_keys[old_other_keys %in% col_names] + attr(data, "metadata")$other_keys <- old_other_keys[old_other_keys %in% col_names] - res + data } #' @export @@ -198,19 +230,40 @@ dplyr_reconstruct.epi_df <- function(data, template) { return(res) } - dplyr::dplyr_reconstruct(res, x) + reconstruct_light_edf(res, x) +} + +#' @export +`[<-.epi_df` <- function(x, i, j, ..., value) { + res <- NextMethod() + + reconstruct_light_edf(res, x) +} + +#' @export +`[[<-.epi_df` <- function(x, i, j, ..., value) { + res <- NextMethod() + + reconstruct_light_edf(res, x) +} + +#' @export +`$<-.epi_df` <- function(x, name, value) { + res <- NextMethod() + + reconstruct_light_edf(res, x) } #' @importFrom dplyr dplyr_col_modify #' @export dplyr_col_modify.epi_df <- function(data, cols) { - dplyr::dplyr_reconstruct(NextMethod(), data) + reconstruct_light_edf(NextMethod(), data) } #' @importFrom dplyr dplyr_row_slice #' @export dplyr_row_slice.epi_df <- function(data, i, ...) { - dplyr::dplyr_reconstruct(NextMethod(), data) + reconstruct_light_edf(NextMethod(), data) } #' @export @@ -224,7 +277,7 @@ dplyr_row_slice.epi_df <- function(data, i, ...) { new_metadata[["other_keys"]] <- new_other_keys } result <- reclass(NextMethod(), new_metadata) - dplyr::dplyr_reconstruct(result, result) + reconstruct_light_edf(result, result) } #' @method group_by epi_df @@ -253,7 +306,7 @@ ungroup.epi_df <- function(x, ...) { #' @param .keep Boolean; see [`dplyr::group_modify`] #' @export group_modify.epi_df <- function(.data, .f, ..., .keep = FALSE) { - dplyr::dplyr_reconstruct(NextMethod(), .data) + reconstruct_light_edf(NextMethod(), .data) } #' "Complete" an `epi_df`, adding missing rows and/or replacing `NA`s @@ -333,7 +386,7 @@ group_modify.epi_df <- function(.data, .f, ..., .keep = FALSE) { #' ) #' @export complete.epi_df <- function(data, ..., fill = list(), explicit = TRUE) { - result <- dplyr::dplyr_reconstruct(NextMethod(), data) + result <- reconstruct_light_edf(NextMethod(), data) if ("time_value" %in% names(rlang::call_match(dots_expand = FALSE)[["..."]])) { attr(result, "metadata")$time_type <- guess_time_type(result$time_value) } @@ -345,14 +398,14 @@ complete.epi_df <- function(data, ..., fill = list(), explicit = TRUE) { #' @param data an `epi_df` #' @export unnest.epi_df <- function(data, ...) { - dplyr::dplyr_reconstruct(NextMethod(), data) + reconstruct_light_edf(NextMethod(), data) } # Simple reclass function reclass <- function(x, metadata) { class(x) <- unique(c("epi_df", class(x))) - attributes(x)$metadata <- metadata - return(x) + attr(x, "metadata") <- metadata + x } #' Arrange an epi_df into a standard order @@ -376,7 +429,7 @@ arrange_canonical.default <- function(x, ...) { cli::cli_abort(c( "`arrange_canonical()` is only meaningful for an {.cls epi_df}." )) - return(x) + x } #' @export @@ -397,14 +450,14 @@ arrange_row_canonical.default <- function(x, ...) { cli::cli_abort(c( "`arrange_row_canonical()` is only meaningful for an {.cls epi_df}." )) - return(x) + x } #' @export arrange_row_canonical.epi_df <- function(x, ...) 
{ rlang::check_dots_empty() cols <- key_colnames(x) - x %>% dplyr::arrange(dplyr::across(dplyr::all_of(cols))) + x[vctrs::vec_order(x[cols]), ] } arrange_col_canonical <- function(x, ...) { @@ -417,14 +470,16 @@ arrange_col_canonical.default <- function(x, ...) { cli::cli_abort(c( "`arrange_col_canonical()` is only meaningful for an {.cls epi_df}." )) - return(x) + x } #' @export arrange_col_canonical.epi_df <- function(x, ...) { rlang::check_dots_empty() - cols <- key_colnames(x) - x %>% dplyr::relocate(dplyr::all_of(cols), .before = 1) + all_names <- names(x) + key_names <- key_colnames(x) + val_names <- all_names[!all_names %in% key_names] + x[c(key_names, val_names)] } #' Group an `epi_df` object by default keys @@ -434,7 +489,7 @@ arrange_col_canonical.epi_df <- function(x, ...) { #' @export group_epi_df <- function(x, exclude = character()) { cols <- key_colnames(x, exclude = exclude) - x %>% group_by(across(all_of(cols))) + reclass(grouped_df(x, cols), attr(x, "metadata")) } #' Aggregate an `epi_df` object diff --git a/R/outliers.R b/R/outliers.R index 68e921bb1..25428e0a9 100644 --- a/R/outliers.R +++ b/R/outliers.R @@ -122,7 +122,7 @@ detect_outlr <- function(x = seq_along(y), y, # Update column names with model abbreviation colnames(results) <- paste(abbr, colnames(results), sep = "_") - return(results) + results }) # Combine information about detected outliers diff --git a/R/reexports.R b/R/reexports.R index 9a33e94bf..00ac83c2c 100644 --- a/R/reexports.R +++ b/R/reexports.R @@ -75,40 +75,3 @@ tidyr::full_seq #' @importFrom ggplot2 autoplot #' @export ggplot2::autoplot - - -# epidatasets ------------------------------------------------------------------- - -#' @rdname epidatasets_reexports -#' -#' @title Selected example data sets from `epidatasets` -#' -#' @description Data sets re-exported from `epidatasets`; please see -#' documentation for each of these objects in `epidatasets`. -#' -#' A brief description of the format of each of the objects above are described -#' in matching order below. -#' -#' @keywords internal -#' @export -delayedAssign("cases_deaths_subset", epidatasets::cases_deaths_subset) - -#' @rdname epidatasets_reexports -#' @keywords internal -#' @export -delayedAssign("covid_incidence_county_subset", epidatasets::covid_incidence_county_subset) - -#' @rdname epidatasets_reexports -#' @keywords internal -#' @export -delayedAssign("covid_incidence_outliers", epidatasets::covid_incidence_outliers) - -#' @rdname epidatasets_reexports -#' @keywords internal -#' @export -delayedAssign("archive_cases_dv_subset", epidatasets::archive_cases_dv_subset) - -#' @rdname epidatasets_reexports -#' @keywords internal -#' @export -delayedAssign("covid_case_death_rates_extended", epidatasets::covid_case_death_rates_extended) diff --git a/R/revision_analysis.R b/R/revision_analysis.R index 279444896..f36dcc16a 100644 --- a/R/revision_analysis.R +++ b/R/revision_analysis.R @@ -19,47 +19,62 @@ #' 8. `rel_spread`: `spread` divided by the largest value (so it will #' always be less than 1). Note that this need not be the final value. It will #' be `NA` whenever `spread` is 0. -#' 9. `time_near_latest`: the time taken for the revisions to settle to within +#' 9. `lag_near_latest`: the time taken for the revisions to settle to within #' `within_latest` (default 20%) of the final value and stay there. 
For #' example, consider the series (0, 20, 99, 150, 102, 100); then -#' `time_near_latest` is 5, since even though 99 is within 20%, it is outside +#' `lag_near_latest` is 5, since even though 99 is within 20%, it is outside #' the window afterwards at 150. #' #' @param epi_arch an epi_archive to be analyzed #' @param ... <[`tidyselect`][dplyr_tidy_select]>, used to choose the column to -#' summarize. If empty, it chooses the first. Currently only implemented for -#' one column at a time. +#' summarize. If empty and there is only one value/measurement column (i.e., +#' not in [`key_colnames`]) in the archive, it will automatically select it. +#' If supplied, `...` must select exactly one column. #' @param drop_nas bool, drop any `NA` values from the archive? After dropping -#' `NA`'s compactify is run again to make sure there are no duplicate values -#' from occasions when the signal is revised to `NA`, and then back to its -#' immediately-preceding value. +#' `NA`'s compactify is run again if `compactify` is `TRUE` to make +#' sure there are no duplicate values from occasions when the signal is +#' revised to `NA`, and then back to its immediately-preceding value. #' @param print_inform bool, determines whether to print summary information, or #' only return the full summary tibble #' @param min_waiting_period `difftime`, integer or `NULL`. Sets a cutoff: any -#' time_values not earlier than `min_waiting_period` before `versions_end` are -#' removed. `min_waiting_period` should characterize the typical time during -#' which revisions occur. The default of 60 days corresponds to a typical -#' final value for case counts as reported in the context of insurance. To -#' avoid this filtering, either set to `NULL` or 0. +#' time_values that have not had at least `min_waiting_period` to stabilize as +#' of the `versions_end` are removed. `min_waiting_period` should characterize +#' the typical time during which most significant revisions occur. The default +#' of 60 days corresponds to a typical near-final value for case counts as +#' reported in the context of insurance. To avoid this filtering, either set +#' to `NULL` or 0. #' @param within_latest double between 0 and 1. Determines the threshold -#' used for the `time_to` +#' used for the `lag_to` #' @param quick_revision difftime or integer (integer is treated as days), for #' the printed summary, the amount of time between the final revision and the #' actual time_value to consider the revision quickly resolved. Default of 3 #' days #' @param few_revisions integer, for the printed summary, the upper bound on the #' number of revisions to consider "few". Default is 3. -#' @param abs_spread_threshold numeric, for the printed summary, the maximum -#' spread used to characterize revisions which don't actually change very -#' much. Default is 5% of the maximum value in the dataset, but this is the -#' most unit dependent of values, and likely needs to be chosen appropriate -#' for the scale of the dataset. -#' @param rel_spread_threshold float between 0 and 1, for the printed summary, -#' the relative spread fraction used to characterize revisions which don't -#' actually change very much. Default is .1, or 10% of the final value -#' @param compactify_tol float, used if `drop_nas=TRUE`, it determines the -#' threshold for when two floats are considered identical. -#' @param should_compactify bool. Compactify if `TRUE`. 
+#' @param abs_spread_threshold length-1 numeric, for the printed summary, the +#' maximum spread used to characterize revisions which don't actually change +#' very much. Default is 5% of the maximum value in the dataset, but this is +#' the most unit dependent of values, and likely needs to be chosen +#' appropriate for the scale of the dataset. +#' @param rel_spread_threshold length-1 double between 0 and 1, for the printed +#' summary, the relative spread fraction used to characterize revisions which +#' don't actually change very much. Default is .1, or 10% of the final value +#' @param compactify bool. If `TRUE`, we will compactify after the signal +#' requested in `...` has been selected on its own and the `drop_nas` step. +#' This helps, for example, to give similar results when called on +#' [merged][epix_merge] and single-signal archives, since merged archives +#' record an update when any of the other signals change, not just the +#' requested signal. The default is `TRUE`. +#' @param compactify_abs_tol length-1 double, used if `compactify` is `TRUE`, it +#' determines the threshold for when two doubles are considered identical. +#' +#' @details Applies to `epi_archive`s with `time_type`s of `"day"`, `"week"`, +#' and `"yearmonth"`. It can also work with a `time_type` of `"integer"` if +#' the possible `time_values` are all consecutive integers; you will need to +#' manually specify the `min_waiting_period` and `quick_revision`, though. +#' Using a `time_type` of `"integer"` with week numbers like 202501 will +#' produce incorrect results for some calculations, since week numbering +#' contains jumps at year boundaries. #' #' @examples #' revision_example <- revision_summary(archive_cases_dv_subset, percent_cli) @@ -67,31 +82,53 @@ #' #' @export #' @importFrom cli cli_inform cli_abort cli_li -#' @importFrom rlang list2 syms +#' @importFrom rlang list2 syms dots_n +#' @importFrom vctrs vec_cast #' @importFrom dplyr mutate group_by arrange filter if_any all_of across pull pick c_across #' everything ungroup summarize if_else %>% revision_summary <- function(epi_arch, ..., drop_nas = TRUE, print_inform = TRUE, - min_waiting_period = as.difftime(60, units = "days"), + min_waiting_period = as.difftime(60, units = "days") %>% + difftime_approx_ceiling_time_delta(epi_arch$time_type), within_latest = 0.2, - quick_revision = as.difftime(3, units = "days"), + quick_revision = as.difftime(3, units = "days") %>% + difftime_approx_ceiling_time_delta(epi_arch$time_type), few_revisions = 3, abs_spread_threshold = NULL, rel_spread_threshold = 0.1, - compactify_tol = .Machine$double.eps^0.5, - should_compactify = TRUE) { - arg <- names(eval_select(rlang::expr(c(...)), allow_rename = FALSE, data = epi_arch$DT)) - if (length(arg) == 0) { - # Choose the first column that's not a key or version - arg <- setdiff(names(epi_arch$DT), c(key_colnames(epi_arch), "version"))[[1]] - } else if (length(arg) > 1) { - cli_abort("Not currently implementing more than one column at a time. Run each separately") + compactify = TRUE, + compactify_abs_tol = 0) { + assert_class(epi_arch, "epi_archive") + # if the column to summarize isn't specified, use the only one if there is only one + if (dots_n(...) 
== 0) { + # Choose the first column that's not a key: + value_colnames <- setdiff(names(epi_arch$DT), key_colnames(epi_arch)) + if (length(value_colnames) == 1) { + arg <- value_colnames + } else { + cli_abort(c( + "Cannot determine which column to summarize.", + "i" = "Value/measurement columns appear to be: {format_varnames(value_colnames)}", + ">" = "Please specify which column to summarize in `...` (with tidyselect syntax)." + ), class = "epiprocess__revision_summary_cannot_determine_default_selection") + } + } else { + # get the names of columns matching any tidyselect used in `...` + arg <- names(eval_select(rlang::expr(c(...)), allow_rename = FALSE, data = epi_arch$DT)) + if (length(arg) == 0) { + cli_abort("Could not find any columns matching the selection in `...`.", + class = "epiprocess__revision_summary__selected_zero_columns" + ) + } + if (length(arg) > 1) { + cli_abort("Not currently implementing more than one column at a time. Run each separately.") + } } if (is.null(abs_spread_threshold)) { abs_spread_threshold <- .05 * epi_arch$DT %>% - pull(...) %>% + pull(!!arg) %>% max(na.rm = TRUE) } # for each time_value, get @@ -101,58 +138,64 @@ revision_summary <- function(epi_arch, # the max lag # # revision_tibble - keys <- key_colnames(epi_arch) + epikey_names <- key_colnames(epi_arch, exclude = c("time_value", "version")) + epikeytime_names <- c(epikey_names, "time_value") + ukey_names <- c(epikeytime_names, "version") + time_type <- epi_arch$time_type revision_behavior <- epi_arch$DT %>% - select(all_of(unique(c("geo_value", "time_value", keys, "version", arg)))) + select(all_of(unique(c(ukey_names, arg)))) if (!is.null(min_waiting_period)) { + last_semistable_time_value <- time_minus_n_steps( + epi_arch$versions_end, + time_delta_to_n_steps(min_waiting_period, time_type), + time_type + ) revision_behavior <- revision_behavior %>% - filter(abs(time_value - as.Date(epi_arch$versions_end)) >= min_waiting_period) + filter(time_value <= last_semistable_time_value) } if (drop_nas) { # if we're dropping NA's, we should recompactify revision_behavior <- revision_behavior %>% - filter(!is.na(c_across(!!arg))) + filter(!is.na(.data[[arg]])) } else { revision_behavior <- epi_arch$DT } - if (should_compactify) { + if (compactify) { revision_behavior <- revision_behavior %>% - arrange(across(c(geo_value, time_value, all_of(keys), version))) %>% # need to sort before compactifying - apply_compactify(c(keys, version), compactify_tol) + apply_compactify(ukey_names, compactify_abs_tol) } revision_behavior <- revision_behavior %>% - mutate(lag = as.integer(version) - as.integer(time_value)) %>% # nolint: object_usage_linter - group_by(across(all_of(keys))) %>% # group by all the keys + mutate(lag = time_minus_time_in_n_steps(version, time_value, time_type)) %>% # nolint: object_usage_linter + group_by(pick(all_of(epikeytime_names))) %>% # group = versions of one measurement summarize( n_revisions = dplyr::n() - 1, min_lag = min(lag), # nolint: object_usage_linter max_lag = max(lag), # nolint: object_usage_linter - min_value = f_no_na(min, pick(!!arg)), - max_value = f_no_na(max, pick(!!arg)), - median_value = f_no_na(median, pick(!!arg)), - time_to = time_within_x_latest(lag, pick(!!arg), prop = within_latest), # nolint: object_usage_linter + min_value = f_no_na(min, .data[[arg]]), + max_value = f_no_na(max, .data[[arg]]), + median_value = f_no_na(median, .data[[arg]]), + lag_to = lag_within_x_latest(lag, .data[[arg]], prop = within_latest), .groups = "drop" ) %>% mutate( spread = max_value - 
min_value, # nolint: object_usage_linter rel_spread = spread / max_value, # nolint: object_usage_linter - # TODO the units here may be a problem - min_lag = as.difftime(min_lag, units = "days"), # nolint: object_usage_linter - max_lag = as.difftime(max_lag, units = "days"), # nolint: object_usage_linter - time_near_latest = as.difftime(time_to, units = "days") # nolint: object_usage_linter + min_lag = n_steps_to_time_delta(min_lag, time_type), # nolint: object_usage_linter + max_lag = n_steps_to_time_delta(max_lag, time_type), # nolint: object_usage_linter + lag_near_latest = n_steps_to_time_delta(lag_to, time_type) # nolint: object_usage_linter ) %>% - select(-time_to) %>% + select(-lag_to) %>% relocate( - time_value, geo_value, all_of(keys), n_revisions, min_lag, max_lag, # nolint: object_usage_linter - time_near_latest, spread, rel_spread, min_value, max_value, median_value # nolint: object_usage_linter + time_value, geo_value, all_of(epikey_names), n_revisions, min_lag, max_lag, # nolint: object_usage_linter + lag_near_latest, spread, rel_spread, min_value, max_value, median_value # nolint: object_usage_linter ) if (print_inform) { cli_inform("Min lag (time to first version):") - difftime_summary(revision_behavior$min_lag) %>% print() + time_delta_summary(revision_behavior$min_lag, time_type) %>% print() if (!drop_nas) { total_na <- epi_arch$DT %>% filter(is.na(c_across(!!arg))) %>% # nolint: object_usage_linter @@ -167,11 +210,11 @@ revision_summary <- function(epi_arch, cli_inform("No revisions:") cli_li(num_percent(total_num_unrevised, total_num, "")) total_quickly_revised <- sum( # nolint: object_usage_linter - revision_behavior$max_lag <= - as.difftime(quick_revision, units = "days") + time_delta_to_n_steps(revision_behavior$max_lag, time_type) <= + time_delta_to_n_steps(quick_revision, time_type) ) - cli_inform("Quick revisions (last revision within {quick_revision} -{units(quick_revision)} of the `time_value`):") + cli_inform("Quick revisions (last revision within {format_time_delta(quick_revision, time_type)} + of the `time_value`):") cli_li(num_percent(total_quickly_revised, total_num, "")) total_barely_revised <- sum( # nolint: object_usage_linter revision_behavior$n_revisions <= @@ -198,17 +241,21 @@ revision_summary <- function(epi_arch, cli_inform("Spread of more than {abs_spread_threshold} in actual value (when revised):") cli_li(num_percent(abs_spread, n_real_revised, "")) - cli_inform("{units(quick_revision)} until within {within_latest*100}% of the latest value:") - difftime_summary(revision_behavior[["time_near_latest"]]) %>% print() + # time_type_unit_pluralizer[[time_type]] is a format string controlled by us + # and/or downstream devs, so we can paste it onto our format string safely: + units_plural <- pluralize(paste0("{qty(2)}", time_type_unit_pluralizer[[time_type]])) # nolint: object_usage_linter + cli_inform("{toTitleCase(units_plural)} until within {within_latest*100}% of the latest value:") + time_delta_summary(revision_behavior[["lag_near_latest"]], time_type) %>% print() } return(revision_behavior) } -#' pull the value from lags when values starts indefinitely being within prop of it's last value. -#' @param values this should be a 1 column tibble. errors may occur otherwise +#' pull the value from lags when values starts indefinitely being within prop of its latest value. 
+#' @param lags vector of lags; should be sorted +#' @param values this should be a vector (e.g., a column) with length matching that of `lags` +#' @param prop optional length-1 double; proportion #' @keywords internal -time_within_x_latest <- function(lags, values, prop = .2) { - values <- values[[1]] +lag_within_x_latest <- function(lags, values, prop = .2) { latest_value <- values[[length(values)]] close_enough <- abs(values - latest_value) < prop * latest_value # we want to ignore any stretches where it's close, but goes farther away later @@ -224,19 +271,18 @@ time_within_x_latest <- function(lags, values, prop = .2) { #' @keywords internal get_last_run <- function(bool_vec, values_from) { runs <- rle(bool_vec) - length(bool_vec) - tail(runs$lengths, n = 1) values_from[[length(bool_vec) - tail(runs$lengths, n = 1) + 1]] } -#' use when the default behavior returns a warning on empty lists, which we do +#' use when the default behavior returns a warning on empty vectors, which we do #' not want, and there is no super clean way of preventing this #' @keywords internal f_no_na <- function(f, x) { x <- x[!is.na(x)] if (length(x) == 0) { - return(Inf) + Inf } else { - return(f(x)) + f(x) } } @@ -248,20 +294,27 @@ num_percent <- function(a, b, b_description) { ({round(a/b*100,digits=2)}%)") } -#' summary doesn't work on difftimes +#' Like `summary` but working across all "time deltas", including difftimes +#' +#' Also standardizes units of difftimes to the natural unit for the given +#' `time_type` (via conversion to and from a corresponding number of time +#' steps). +#' #' @keywords internal -difftime_summary <- function(diff_time_val) { - if (length(diff_time_val) > 0) { +time_delta_summary <- function(time_delta, time_type) { + if (length(time_delta) > 0) { + n_steps <- time_delta_to_n_steps(time_delta, time_type) res <- data.frame( - min = min(diff_time_val), - median = median(diff_time_val), - mean = round(mean(diff_time_val), 1), - max = max(diff_time_val), + min = min(n_steps), + median = median(n_steps), + mean = round(mean(n_steps), 1), + max = max(n_steps), row.names = " ", check.names = FALSE - ) - return(res) + ) %>% + mutate(across(c(min, median, mean, max), ~ .x * unit_time_delta(time_type))) + res } else { - return(data.frame()) + data.frame() } } diff --git a/R/slide.R b/R/slide.R index 761639d44..abc7c3b77 100644 --- a/R/slide.R +++ b/R/slide.R @@ -65,7 +65,7 @@ #' determined the time window for the current computation. #' #' @importFrom lubridate days weeks -#' @importFrom dplyr bind_rows group_map group_vars filter select +#' @importFrom dplyr group_map group_vars filter select #' @importFrom rlang .data .env !! enquos sym env missing_arg #' @export #' @seealso [`epi_slide_opt`] for optimized slide functions @@ -281,7 +281,7 @@ epi_slide <- function( i <<- i + 1L .slide_comp(.x, .group_key, .ref_time_value, ...) } - return(slide_comp_wrapper) + slide_comp_wrapper } # - If .x is not grouped, then the trivial group is applied: @@ -309,15 +309,14 @@ epi_slide <- function( ..., .keep = TRUE ) %>% - bind_rows() %>% - filter(.real) %>% - select(-.real) %>% + list_rbind() %>% + `[`(.$.real, names(.) != ".real") %>% arrange_col_canonical() %>% group_by(!!!.x_orig_groups) # If every group in epi_slide_one_group takes the # length(available_ref_time_values) == 0 branch then we end up here. - if (ncol(result) == ncol(.x %>% select(-.real))) { + if (ncol(result) == ncol(.x[names(.x) != ".real"])) { cli_abort( "epi_slide: no new columns were created. 
This can happen if every group has no available ref_time_values. This is likely a mistake in your data, in the slide computation, or in the ref_time_values argument.", @@ -341,30 +340,33 @@ epi_slide_one_group <- function( # Unpack the date_seq_list argument and complete the data group with missing # time values, padding on the left and right as needed. all_dates <- .date_seq_list$all_dates - missing_times <- all_dates[!(all_dates %in% .data_group$time_value)] - .data_group <- bind_rows( + missing_times <- all_dates[!vec_in(all_dates, .data_group$time_value)] + .data_group <- reclass(vec_rbind( .data_group, - dplyr::bind_cols( - .group_key, - tibble( - time_value = c( - missing_times, - .date_seq_list$pad_early_dates, - .date_seq_list$pad_late_dates - ), .real = FALSE - ) - ) - ) %>% - arrange(.data$time_value) + # (^ epi_df; epi_slide uses .keep = TRUE) + # (v tibble -> vec_rbind outputs tibble) + new_tibble(vec_recycle_common( + !!!.group_key, + time_value = c( + missing_times, + .date_seq_list$pad_early_dates, + .date_seq_list$pad_late_dates + ), + .real = FALSE + )) + # we should be adding time values of the same time_type (and shouldn't be + # introducing duplicate epikeytime values); we can reclass without checks: + ), attr(.data_group, "metadata")) %>% + `[`(vec_order(.$time_value), ) # If the data group does not contain any of the reference time values, return - # the original .data_group without slide columns and let bind_rows at the end + # the original .data_group without slide columns and let vec_rbind at the end # of group_modify handle filling the empty data frame with NA values. if (length(available_ref_time_values) == 0L) { if (.all_rows) { return(.data_group) } - return(.data_group %>% filter(FALSE)) + return(.data_group[0, ]) } # Get stateful function that tracks ref_time_value per group and sends it to @@ -398,11 +400,11 @@ epi_slide_one_group <- function( # back. return_types <- purrr::map_chr(slide_values_list, function(x) { if (is.data.frame(x)) { - return("data.frame") + "data.frame" } else if (vctrs::obj_is_vector(x) && is.null(vctrs::vec_names(x))) { - return("vector") + "vector" } else { - return("other") + "other" } }) %>% unique() # Returned values must be data.frame or vector. 
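Since the checks above only accept slide computations that return a bare vector or a data frame, here is a rough sketch of the two shapes, assuming the usual `epi_slide()` interface (`.f` as a formula over `.x`, `.window_size` in days); the toy data and column names are made up:

library(dplyr)
library(epiprocess)

edf <- tibble::tibble(
  geo_value = rep(c("ca", "ut"), each = 10),
  time_value = rep(as.Date("2020-06-01") + 0:9, times = 2),
  cases = c(1:10, 11:20)
) %>%
  as_epi_df(as_of = as.Date("2020-06-15"))

# Returning a length-1 vector adds a single column (named via `.new_col_name`):
edf %>%
  group_by(geo_value) %>%
  epi_slide(~ mean(.x$cases), .window_size = 7)

# Returning a one-row data frame should add one column per data-frame column:
edf %>%
  group_by(geo_value) %>%
  epi_slide(
    ~ tibble::tibble(cases_mean = mean(.x$cases), cases_sum = sum(.x$cases)),
    .window_size = 7
  )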
@@ -436,10 +438,10 @@ epi_slide_one_group <- function( # If all rows, then pad slide values with NAs, else filter down data group if (.all_rows) { orig_values <- slide_values - slide_values <- vctrs::vec_rep(vctrs::vec_cast(NA, orig_values), nrow(.data_group)) - vctrs::vec_slice(slide_values, .data_group$time_value %in% available_ref_time_values) <- orig_values + slide_values <- vec_rep(vec_cast(NA, orig_values), nrow(.data_group)) + vec_slice(slide_values, vec_in(.data_group$time_value, available_ref_time_values)) <- orig_values } else { - .data_group <- .data_group %>% filter(time_value %in% available_ref_time_values) + .data_group <- .data_group[vec_in(.data_group$time_value, available_ref_time_values), ] } # To label the result, we will parallel some code from `epix_slide`, though @@ -512,7 +514,7 @@ epi_slide_one_group <- function( res[[.new_col_name]] <- slide_values } - return(res) + res } get_before_after_from_window <- function(window_size, align, time_type) { @@ -544,7 +546,7 @@ get_before_after_from_window <- function(window_size, align, time_type) { after <- window_size - 1 } } - return(list(before = before, after = after)) + list(before = before, after = after) } #' Optimized slide functions for common cases @@ -609,7 +611,7 @@ get_before_after_from_window <- function(window_size, align, time_type) { #' - `{.f_abbr}` will be a character vector containing a short abbreviation #' for `.f` factoring in the input column type(s) for `.col_names` #' -#' @importFrom dplyr bind_rows mutate %>% arrange tibble select all_of +#' @importFrom dplyr mutate %>% arrange tibble select all_of #' @importFrom rlang enquo expr_label caller_arg quo_get_env #' @importFrom tidyselect eval_select #' @importFrom glue glue @@ -895,15 +897,18 @@ epi_slide_opt <- function( pad_late_dates <- date_seq_list$pad_late_dates slide_one_grp <- function(.data_group, .group_key, ...) { - missing_times <- all_dates[!(all_dates %in% .data_group$time_value)] + missing_times <- all_dates[!vec_in(all_dates, .data_group$time_value)] # `frollmean` requires a full window to compute a result. Add NA values # to beginning and end of the group so that we get results for the # first `before` and last `after` elements. - .data_group <- bind_rows( - .data_group, - tibble(time_value = c(missing_times, pad_early_dates, pad_late_dates), .real = FALSE) + .data_group <- vec_rbind( + .data_group, # (tibble; epi_slide_opt uses .keep = FALSE) + new_tibble(vec_recycle_common( + time_value = c(missing_times, pad_early_dates, pad_late_dates), + .real = FALSE + )) ) %>% - arrange(.data$time_value) + `[`(vec_order(.$time_value), ) if (f_from_package == "data.table") { # Grouping should ensure that we don't have duplicate time values. @@ -952,20 +957,20 @@ epi_slide_opt <- function( } } - return(.data_group) + .data_group } - result <- mutate(.x, .real = TRUE) %>% + result <- .x %>% + `[[<-`(".real", value = TRUE) %>% group_modify(slide_one_grp, ..., .keep = FALSE) %>% - filter(.data$.real) %>% - select(-.real) %>% + `[`(.$.real, names(.) 
!= ".real") %>% arrange_col_canonical() %>% group_by(!!!.x_orig_groups) if (.all_rows) { - result[!(result$time_value %in% ref_time_values), result_col_names] <- NA + result[!vec_in(result$time_value, ref_time_values), result_col_names] <- NA } else if (user_provided_rtvs) { - result <- result[result$time_value %in% ref_time_values, ] + result <- result[vec_in(result$time_value, ref_time_values), ] } if (!is_epi_df(result)) { @@ -979,7 +984,7 @@ epi_slide_opt <- function( #' @rdname epi_slide_opt #' @description `epi_slide_mean` is a wrapper around `epi_slide_opt` with `.f = -#' datatable::frollmean`. +#' data.table::frollmean`. #' #' @export epi_slide_mean <- function( @@ -1039,7 +1044,7 @@ epi_slide_mean <- function( #' @rdname epi_slide_opt #' @description `epi_slide_sum` is a wrapper around `epi_slide_opt` with `.f = -#' datatable::frollsum`. +#' data.table::frollsum`. #' #' @export epi_slide_sum <- function( @@ -1147,9 +1152,9 @@ full_date_seq <- function(x, before, after, time_type) { } } - return(list( + list( all_dates = all_dates, pad_early_dates = pad_early_dates, pad_late_dates = pad_late_dates - )) + ) } diff --git a/R/time-utils.R b/R/time-utils.R new file mode 100644 index 000000000..73fbc8a56 --- /dev/null +++ b/R/time-utils.R @@ -0,0 +1,370 @@ +#' Use max valid period as guess for `period` of `time_values` +#' +#' `r lifecycle::badge("experimental")` +#' +#' @param time_values Vector containing time-interval-like or time-point-like +#' data, with at least two distinct values. +#' @param time_values_arg Optional, string; name to give `time_values` in error +#' messages. Defaults to quoting the expression the caller fed into the +#' `time_values` argument. +#' @param ... Should be empty, there to satisfy the S3 generic. +#' @return length-1 vector; `r lifecycle::badge("experimental")` class will +#' either be the same class as [`base::diff()`] on such time values, an +#' integer, or a double, such that all `time_values` can be exactly obtained +#' by adding `k * result` for an integer k, and such that there is no smaller +#' `result` that can achieve this. +#' +#' @keywords internal +#' @export +guess_period <- function(time_values, time_values_arg = rlang::caller_arg(time_values), ...) { + UseMethod("guess_period") +} + +#' @export +guess_period.default <- function(time_values, time_values_arg = rlang::caller_arg(time_values), ...) { + rlang::check_dots_empty() + sorted_distinct_time_values <- sort(unique(time_values)) + if (length(sorted_distinct_time_values) < 2L) { + cli_abort("Not enough distinct values in {.code {time_values_arg}} to guess the period.", + class = "epiprocess__guess_period__not_enough_times", + time_values = time_values + ) + } + skips <- diff(sorted_distinct_time_values) + # Certain diff results have special classes or attributes; use vctrs to try to + # appropriately destructure for gcd_num, then restore to their original class + # & attributes. + skips_data <- vctrs::vec_data(skips) + period_data <- gcd_num(skips_data, rrtol = 0) + vctrs::vec_restore(period_data, skips) +} + +# `full_seq()` doesn't like difftimes, so convert to the natural units of some time types: + +#' @export +guess_period.Date <- function(time_values, time_values_arg = rlang::caller_arg(time_values), ...) { + as.numeric(NextMethod(), units = "days") +} + +#' @export +guess_period.POSIXt <- function(time_values, time_values_arg = rlang::caller_arg(time_values), ...) 
{ + as.numeric(NextMethod(), units = "secs") +} + +#' Validate `.before` or `.window_size` argument +#' @keywords internal +validate_slide_window_arg <- function(arg, time_type, lower = 1, allow_inf = TRUE, arg_name = rlang::caller_arg(arg)) { + if (time_type == "custom") { + cli_abort( + "Unsure how to interpret slide units with a custom time type. Consider converting your time + column to a Date, yearmonth, or integer type.", + class = "epiprocess__validate_slide_window_arg" + ) + } + + msg <- "" + inf_if_okay <- if (allow_inf) { + "Inf" + } else { + character(0L) + } + + # nolint start: indentation_linter. + if (time_type == "day") { + if (!(test_sensible_int(arg, lower = lower) || + inherits(arg, "difftime") && length(arg) == 1L && units(arg) == "days" || + allow_inf && identical(arg, Inf) + )) { + msg <- glue::glue_collapse(c("length-1 difftime with units in days", "non-negative integer", inf_if_okay), " or ") + } + } else if (time_type == "week") { + if (!(inherits(arg, "difftime") && length(arg) == 1L && units(arg) == "weeks" || + allow_inf && identical(arg, Inf) + )) { + msg <- glue::glue_collapse(c("length-1 difftime with units in weeks", inf_if_okay), " or ") + } + } else if (time_type == "yearmonth") { + if (!(test_sensible_int(arg, lower = lower) || + allow_inf && identical(arg, Inf) + )) { + msg <- glue::glue_collapse(c("non-negative integer", inf_if_okay), " or ") + } + } else if (time_type == "integer") { + if (!(test_sensible_int(arg, lower = lower) || + allow_inf && identical(arg, Inf) + )) { + msg <- glue::glue_collapse(c("non-negative integer", inf_if_okay), " or ") + } + } else { + cli_abort('`epiprocess` internal error: unrecognized time_type: "{time_type}"', + class = "epiprocess__unrecognized_time_type" + ) + } + # nolint end + + if (msg != "") { + cli_abort( + "Slide function expected `{arg_name}` to be a {msg}.", + class = "epiprocess__validate_slide_window_arg" + ) + } +} + +#' Object that, added to time_values of time_type, advances by one time step/interval +#' +#' @param time_type string; `epi_df`'s or `epi_archive`'s `time_type` +#' @param format "friendly" or "fast"; for some time_types, there are multiple +#' ways to represent time_deltas. "friendly" tries to output a format that +#' will be more informative when printed, and produce errors in more cases +#' when used in unexpected ways. "fast" tries to output a time_delta that will +#' be faster in downstream operations. +#' @return an object `u` such that `time_values + u` represents advancing by one +#' time step / moving to the subsequent time interval for any `time_values` +#' object of time type `time_type`, and such that `time_values + k * u` for +#' integerish vector `k` advances by `k` steps (with vectorization, +#' recycling). At time of writing, these objects also all support +#' multiplication by nonintegerish numeric vectors, `mean`, and `median`, +#' which are useful for summarizing vector time_deltas, but these fractional +#' time_deltas are not allowed in time_delta-specific operations. 
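Stepping back to `validate_slide_window_arg()` above: a sketch of what it accepts for each `time_type` (the helper is internal, so these calls are as seen from inside the package):

# Daily data: a bare non-negative integer or a difftime in days is fine.
validate_slide_window_arg(7L, "day")
validate_slide_window_arg(as.difftime(7, units = "days"), "day")

# Weekly data: only a difftime in weeks (or Inf) passes; a bare `2` would error.
validate_slide_window_arg(as.difftime(2, units = "weeks"), "week")

# Unbounded windows are accepted whenever `allow_inf = TRUE` (the default).
validate_slide_window_arg(Inf, "yearmonth")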
+#'
+#' @keywords internal
+unit_time_delta <- function(time_type, format = c("friendly", "fast")) {
+  format <- rlang::arg_match(format)
+  switch(format,
+    friendly = switch(time_type,
+      day = as.difftime(1, units = "days"),
+      week = as.difftime(1, units = "weeks"),
+      yearmonth = 1,
+      integer = 1L,
+      cli_abort("Unsupported time_type: {time_type}")
+    ),
+    fast = switch(time_type,
+      day = 1,
+      week = 7,
+      yearmonth = 1,
+      integer = 1L,
+      cli_abort("Unsupported time_type: {time_type}")
+    )
+  )
+}
+
+#' Convert a time delta to an integerish number of "unit" steps between time values
+#'
+#' @param time_delta a vector that can be added to time values of time type
+#' `time_type` to arrive at other time values of that time type, or
+#' `r lifecycle::badge("experimental")` such a vector with Inf/-Inf entries mixed
+#' in, if supported by the class of `time_delta`, even if `time_type` doesn't
+#' necessarily support Inf/-Inf entries. Basically a slide window arg but
+#' without sign and length restrictions.
+#' @param time_type as in `validate_slide_window_arg`
+#' @return [bare integerish][rlang::is_integerish] vector (with possible
+#' infinite values) that produces the same result as `time_delta` when
+#' multiplied by the natural [`unit_time_delta`] for
+#' that time type and added to time values of time type `time_type`. If the
+#' given time type does not support infinite values, then it should produce
+#' +Inf or -Inf for analogous entries of `time_delta`, and match the addition
+#' result for non-infinite entries.
+#'
+#' @keywords internal
+time_delta_to_n_steps <- function(time_delta, time_type) {
+  # could be S3 if we're willing to export
+  if (inherits(time_delta, "difftime")) {
+    output_units <- switch(time_type,
+      day = "days",
+      week = "weeks",
+      cli_abort("difftime objects not supported for time_type {format_chr_with_quotes(time_type)}")
+    )
+    units(time_delta) <- output_units # converts number to represent same duration; not just attr<-
+    n_steps <- vec_data(time_delta)
+    if (!is_bare_integerish(n_steps)) {
+      cli_abort("`time_delta` did not appear to contain only integerish numbers
+        of steps between time values of time type {format_chr_with_quotes(time_type)}")
+    }
+    n_steps
+  } else if (is_bare_integerish(time_delta)) { # (allows infinite values)
+    switch(time_type,
+      day = ,
+      week = ,
+      yearmonth = ,
+      integer = time_delta,
+      cli_abort("Invalid or unsupported time_type {format_chr_with_quotes(time_type)}")
+    )
+  } else {
+    cli_abort("Invalid or unsupported kind of `time_delta`")
+  }
+}
+
+#' Convert from integerish/infinite/mix to time_delta
+#'
+#' @param n_steps integerish vector that can mix in infinite values
+#' @param time_type as in [`validate_slide_window_arg`]
+#' @param format optional; `"friendly"` to output a more descriptive/friendly
+#' class like `"difftime"` when possible; `"fast"` to output a class that's
+#' generally faster to work with when possible, like a vanilla `"numeric"`.
+#' Default is `"friendly"`.
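These conversion helpers are easiest to read by example; a small sketch (all but `guess_period()` are internal, hence the `:::`):

# One weekly "step" in its two representations:
epiprocess:::unit_time_delta("week")         # friendly: difftime of 1 week
epiprocess:::unit_time_delta("week", "fast") # fast: the bare number 7 (days)

# A 14-day difftime corresponds to two weekly steps:
epiprocess:::time_delta_to_n_steps(as.difftime(14, units = "days"), "week") # 2

# And back: three daily steps as a "friendly" difftime:
epiprocess:::n_steps_to_time_delta(3, "day") # difftime of 3 days

# `guess_period()` infers the step size from the data itself:
guess_period(as.Date("2020-01-01") + c(0, 7, 14, 28)) # 7 (days)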
+#' +#' @keywords internal +n_steps_to_time_delta <- function(n_steps, time_type, format = c("friendly", "fast")) { + if (!is_bare_integerish(n_steps)) { + cli_abort("`n_steps` did not appear to be integerish (or infinite, or a mix)") + } + n_steps * unit_time_delta(time_type, format) +} + +#' Standardize time_deltas to a multiple of [`unit_time_delta()`] +#' +#' @keywords internal +time_delta_standardize <- function(time_delta, time_type, format = c("friendly", "fast")) { + time_delta_to_n_steps(time_delta, time_type) * unit_time_delta(time_type, format) +} + +#' Helper data for [`time_type_unit_abbr`] +#' +#' @keywords internal +time_type_unit_abbrs <- c( + day = "d", + week = "w", + yearmonth = "m" +) +# ^ Using these unit abbreviations happens to make our automatic slide output +# naming look like taking ISO-8601 duration designations, removing the P, and +# lowercasing any characters. Fortnightly or sub-daily time types would need an +# adjustment to remain consistent. + +#' Get an abbreviation for the "units" of `unit_time_delta(time_type)` +#' +#' For use in formatting or automatically naming things based on +#' `time_delta_to_n_steps(time_delta)` for a `time_delta` between times of time +#' type `time_type`. +#' +#' @param time_type str +#' @return str +#' +#' @keywords internal +time_type_unit_abbr <- function(time_type) { + maybe_unit_abbr <- time_type_unit_abbrs[time_type] + if (is.na(maybe_unit_abbr)) { + cli_abort("Cannot determine the units of time type {format_chr_with_quotes(time_type)}") + } + maybe_unit_abbr +} + +#' Helper data for [`format_time_delta`] +#' +#' Should not be altered on the basis of untrusted user input, as it is used as +#' a cli format string and may run code. +#' +#' @keywords internal +time_type_unit_pluralizer <- c( + day = "day{?s}", + week = "week{?s}", + yearmonth = "month{?s}", + integer = "time step{?s}" +) + +#' Format a length-1 time delta to a character to assist messaging +#' +#' This is meant to address the following: +#' - `glue::glue("{as.difftime(1, units = 'days')}")` is "1" +#' - `glue::glue("{format(as.difftime(1, units = 'days'))}")` is "1 days" +#' - time deltas for yearmonths and integers don't have units attached at all +#' +#' @keywords internal +format_time_delta <- function(x, time_type) { + n_steps <- time_delta_to_n_steps(x, time_type) # nolint: object_usage_linter + # time_type_unit_pluralizer[[time_type]] is a format string controlled by us + # and/or downstream devs, so we can paste it onto our format string safely: + pluralize(paste0("{n_steps} ", time_type_unit_pluralizer[[time_type]])) +} + +#' Convert `time_delta` to an approximate difftime +#' +#' `r lifecycle::badge("experimental")` +#' +#' To assist in comparing `time_delta`s to default `difftime` thresholds when we +#' want to reduce friction. +#' +#' It may be better to try to do something like make `time_delta` validation +#' more accommodating (e.g., of difftimes with units of "days" when working on +#' weekly scale), and remain rigid on yearmonths. Applying deltas and comparing +#' time_values might also be an approach but seems more fraught as the least +#' common denominator would be start/mid/end datetimes of time intervals, but +#' those are also ambiguous (starting&representation wdays of weeks are unknown, +#' timezone of dates are unknown). +#' +#' Another alternative approach, below, converts difftimes to time_deltas +#' instead. 
It requires knowledge of which way to round in order to get +#' time_deltas representing an integer number of time steps, but avoids some +#' potential inconsistencies of the time-delta-to-difftime approach when we +#' think about applying it to, e.g., months / spans of months with varying +#' numbers of days, and also makes it easier to avoid "magical defaults". +#' +#' @keywords internal +time_delta_to_approx_difftime <- function(time_delta, time_type) { + switch(time_type, + day = , + week = time_delta_standardize(time_delta, time_type, "friendly"), + yearmonth = time_delta * as.difftime(30, units = "days"), + integer = , + cli_abort("Unsupported time_type for this operation: {time_type}") + ) +} + +#' Closest time_delta that's approximately greater than or equal to given difftime +#' +#' `r lifecycle::badge("experimental")` +#' +#' @param difftime a difftime object +#' @param time_type as in [`validate_slide_window_arg`] +#' @return An object representing an integerish number (or vector of numbers) of +#' time steps between consecutive time_values of type `time_type`. +#' +#' @keywords internal +difftime_approx_ceiling_time_delta <- function(difftime, time_type) { + assert_class(difftime, "difftime") + switch(time_type, + day = , + week = { + units(difftime) <- paste0(time_type, "s") + ceiling(difftime) + }, + yearmonth = { + units(difftime) <- "days" + ceiling(as.numeric(difftime) / 30) + }, + integer = , + cli_abort("Unsupported time_type for this operation: {time_type}") + ) +} + +#' Difference between two time value vectors in terms of number of time "steps" +#' +#' @param x a time_value (vector) of time type `time_type` +#' @param y a time_value (vector) of time type `time_type` +#' @param time_type as in [`validate_slide_window_arg()`] +#' @return integerish vector such that `x + n_steps_to_time_delta_fast(result)` +#' should equal `y`. +#' +#' @keywords internal +time_minus_time_in_n_steps <- function(x, y, time_type) { + time_delta_to_n_steps(x - y, time_type) +} + +#' Advance/retreat time_values by specified number of time "steps" +#' +#' Here, a "step" is based on the `time_type`, not just the class of `x`. 
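For instance (a sketch; both helpers are internal):

# Two time values that are 14 days apart are 2 steps apart on a weekly scale:
epiprocess:::time_minus_time_in_n_steps(
  as.Date("2020-03-15"), as.Date("2020-03-01"), "week"
) # 2

# ...and stepping forward undoes it:
epiprocess:::time_plus_n_steps(as.Date("2020-03-01"), 2, "week") # 2020-03-15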
+#' +#' @param x a time_value (vector) of time type `time_type` +#' @param y integerish (vector) +#' @param time_type as in [`validate_slide_window_arg()`] +#' @return a time_value (vector) of time type `time_type` +#' +#' @keywords internal +time_plus_n_steps <- function(x, y, time_type) { + x + y * unit_time_delta(time_type, "fast") +} + +#' @rdname time_plus_n_steps +time_minus_n_steps <- function(x, y, time_type) { + x - y * unit_time_delta(time_type, "fast") +} diff --git a/R/utils.R b/R/utils.R index e350ade25..66270d69c 100644 --- a/R/utils.R +++ b/R/utils.R @@ -89,13 +89,13 @@ paste_lines <- function(lines) { paste(paste0(lines, "\n"), collapse = "") } -#' Format a class vector as a string via deparsing it +#' Format a character vector as a string via deparsing it #' -#' @param class_vec `chr`; output of `class(object)` for some `object` +#' @param x `chr`; e.g., output of `class(object)` for some `object` #' @return string #' @keywords internal -format_class_vec <- function(class_vec) { - paste(collapse = "", deparse(class_vec)) +format_chr_deparse <- function(x) { + paste(collapse = "", deparse(x)) } #' Format a character vector as a string via deparsing/quoting each @@ -547,7 +547,7 @@ as_slide_computation <- function(.f, ..., } cli_abort( - "Can't convert an object of class {format_class_vec(class(.f))} + "Can't convert an object of class {format_chr_deparse(class(.f))} to a slide computation", class = "epiprocess__as_slide_computation__cant_convert_catchall", epiprocess__f = .f, @@ -617,7 +617,7 @@ guess_geo_type <- function(geo_value) { } } - return("custom") + "custom" } @@ -679,7 +679,7 @@ time_column_names <- function() { ) substitutions <- upcase_snake_case(substitutions) names(substitutions) <- rep("time_value", length(substitutions)) - return(substitutions) + substitutions } # #' potential geo_value columns @@ -696,7 +696,7 @@ geo_column_names <- function() { ) substitutions <- upcase_snake_case(substitutions) names(substitutions) <- rep("geo_value", length(substitutions)) - return(substitutions) + substitutions } #' potential version columns @@ -711,7 +711,7 @@ version_column_names <- function() { ) substitutions <- upcase_snake_case(substitutions) names(substitutions) <- rep("version", length(substitutions)) - return(substitutions) + substitutions } #' rename potential time_value columns @@ -753,7 +753,7 @@ guess_column_name <- function(x, column_name, substitutions) { } ) } - return(x) + x } ########## @@ -979,59 +979,6 @@ gcd_num <- function(dividends, ..., rrtol = 1e-6, pqlim = 1e6, irtol = 1e-6) { vctrs::vec_cast(numeric_gcd, dividends) } -#' Use max valid period as guess for `period` of `time_values` -#' -#' `r lifecycle::badge("experimental")` -#' -#' @param time_values Vector containing time-interval-like or time-point-like -#' data, with at least two distinct values. -#' @param time_values_arg Optional, string; name to give `time_values` in error -#' messages. Defaults to quoting the expression the caller fed into the -#' `time_values` argument. -#' @param ... Should be empty, there to satisfy the S3 generic. -#' @return length-1 vector; `r lifecycle::badge("experimental")` class will -#' either be the same class as [`base::diff()`] on such time values, an -#' integer, or a double, such that all `time_values` can be exactly obtained -#' by adding `k * result` for an integer k, and such that there is no smaller -#' `result` that can achieve this. 
-#' -#' @keywords internal -#' @export -guess_period <- function(time_values, time_values_arg = rlang::caller_arg(time_values), ...) { - UseMethod("guess_period") -} - -#' @export -guess_period.default <- function(time_values, time_values_arg = rlang::caller_arg(time_values), ...) { - rlang::check_dots_empty() - sorted_distinct_time_values <- sort(unique(time_values)) - if (length(sorted_distinct_time_values) < 2L) { - cli_abort("Not enough distinct values in {.code {time_values_arg}} to guess the period.", - class = "epiprocess__guess_period__not_enough_times", - time_values = time_values - ) - } - skips <- diff(sorted_distinct_time_values) - # Certain diff results have special classes or attributes; use vctrs to try to - # appropriately destructure for gcd_num, then restore to their original class - # & attributes. - skips_data <- vctrs::vec_data(skips) - period_data <- gcd_num(skips_data, rrtol = 0) - vctrs::vec_restore(period_data, skips) -} - -# `full_seq()` doesn't like difftimes, so convert to the natural units of some time types: - -#' @export -guess_period.Date <- function(time_values, time_values_arg = rlang::caller_arg(time_values), ...) { - as.numeric(NextMethod(), units = "days") -} - -#' @export -guess_period.POSIXt <- function(time_values, time_values_arg = rlang::caller_arg(time_values), ...) { - as.numeric(NextMethod(), units = "secs") -} - #' Is `x` an "int" with a sensible class? TRUE/FALSE #' #' Like [`checkmate::test_int`] but disallowing some non-sensible classes that @@ -1053,148 +1000,6 @@ test_sensible_int <- function(x, na.ok = FALSE, lower = -Inf, upper = Inf, # nol } } -validate_slide_window_arg <- function(arg, time_type, lower = 1, allow_inf = TRUE, arg_name = rlang::caller_arg(arg)) { - if (time_type == "custom") { - cli_abort( - "Unsure how to interpret slide units with a custom time type. Consider converting your time - column to a Date, yearmonth, or integer type.", - class = "epiprocess__validate_slide_window_arg" - ) - } - - msg <- "" - inf_if_okay <- if (allow_inf) { - "Inf" - } else { - character(0L) - } - - # nolint start: indentation_linter. 
- if (time_type == "day") { - if (!(test_sensible_int(arg, lower = lower) || - inherits(arg, "difftime") && length(arg) == 1L && units(arg) == "days" || - allow_inf && identical(arg, Inf) - )) { - msg <- glue::glue_collapse(c("length-1 difftime with units in days", "non-negative integer", inf_if_okay), " or ") - } - } else if (time_type == "week") { - if (!(inherits(arg, "difftime") && length(arg) == 1L && units(arg) == "weeks" || - allow_inf && identical(arg, Inf) - )) { - msg <- glue::glue_collapse(c("length-1 difftime with units in weeks", inf_if_okay), " or ") - } - } else if (time_type == "yearmonth") { - if (!(test_sensible_int(arg, lower = lower) || - allow_inf && identical(arg, Inf) - )) { - msg <- glue::glue_collapse(c("non-negative integer", inf_if_okay), " or ") - } - } else if (time_type == "integer") { - if (!(test_sensible_int(arg, lower = lower) || - allow_inf && identical(arg, Inf) - )) { - msg <- glue::glue_collapse(c("non-negative integer", inf_if_okay), " or ") - } - } else { - cli_abort('`epiprocess` internal error: unrecognized time_type: "{time_type}"', - class = "epiprocess__unrecognized_time_type" - ) - } - # nolint end - - if (msg != "") { - cli_abort( - "Slide function expected `{arg_name}` to be a {msg}.", - class = "epiprocess__validate_slide_window_arg" - ) - } -} - - -#' Convert a time delta to a integerish number of "unit" steps between time values -#' -#' @param time_delta a vector that can be added to time values of time type -#' `time_type` to arrive at other time values of that time type, or -#' `r lifecycle::badge("experimental")` such a vector with Inf/-Inf entries mixed -#' in, if supported by the class of `time_delta`, even if `time_type` doesn't -#' necessarily support Inf/-Inf entries. Basically a slide window arg but -#' without sign and length restrictions. -#' @param time_type as in `validate_slide_window_arg` -#' @return [bare integerish][rlang::is_integerish] vector (with possible -#' infinite values) that produces the same result as `time_delta` when -#' multiplied by the natural [`unit_time_delta`] for -#' that time type and added to time values of time type `time_type`. If the -#' given time type does not support infinite values, then it should produce -#' +Inf or -Inf for analogous entries of `time_delta`, and match the addition -#' result match the addition result for non-infinite entries. 
-#' -#' @keywords internal -time_delta_to_n_steps <- function(time_delta, time_type) { - # could be S3 if we're willing to export - if (inherits(time_delta, "difftime")) { - output_units <- switch(time_type, - day = "days", - week = "weeks", - cli_abort("difftime objects not supported for time_type {format_chr_with_quotes(time_type)}") - ) - units(time_delta) <- output_units # converts number to represent same duration; not just attr<- - n_steps <- vec_data(time_delta) - if (!is_bare_integerish(n_steps)) { - cli_abort("`time_delta` did not appear to contain only integerish numbers - of steps between time values of time type {format_chr_with_quotes(time_type)}") - } - n_steps - } else if (is_bare_integerish(time_delta)) { # (allows infinite values) - switch(time_type, - day = , - week = , - yearmonth = , - integer = time_delta, - cli_abort("Invalid or unsupported time_type {format_chr_with_quotes(time_type)}") - ) - } else { - cli_abort("Invalid or unsupported kind of `time_delta`") - } -} - -#' Object that, added to time_values of time_type, advances by one time step/interval -#' -#' @param time_type string; `epi_df`'s or `epi_archive`'s `time_type` -#' @return an object `u` such that `time_values + u` represents advancing by one -#' time step / moving to the subsequent time interval for any `time_values` -#' object of time type `time_type`, and such that `time_values + k * u` for -#' integerish vector `k` advances by `k` steps (with vectorization, -#' recycling). -#' -#' @keywords internal -unit_time_delta <- function(time_type) { - switch(time_type, - day = as.difftime(1, units = "days"), - week = as.difftime(1, units = "weeks"), - yearmonth = 1, - integer = 1L, - cli_abort("Unsupported time_type: {time_type}") - ) -} - -# Using these unit abbreviations happens to make our automatic slide output -# naming look like taking ISO-8601 duration designations, removing the P, and -# lowercasing any characters. Fortnightly or sub-daily time types would need an -# adjustment to remain consistent. -time_type_unit_abbrs <- c( - day = "d", - week = "w", - yearmonth = "m" -) - -time_type_unit_abbr <- function(time_type) { - maybe_unit_abbr <- time_type_unit_abbrs[time_type] - if (is.na(maybe_unit_abbr)) { - cli_abort("Cannot determine the units of time type {format_chr_with_quotes(time_type)}") - } - maybe_unit_abbr -} - #' Extract singular element of a length-1 unnamed list (validated) #' #' Inverse of `list(elt)`. @@ -1233,12 +1038,11 @@ check_ukey_unique <- function(x, ukey_names, end_cli_message = character()) { TRUE } else { # Fast check, slow error message. - arranged_ukeys <- arrange(x[ukey_names], across(all_of(ukey_names))) - if (!any(vec_equal(arranged_ukeys[-1L, ], arranged_ukeys[-nrow(arranged_ukeys), ]))) { + if (!vctrs::vec_duplicate_any(x[ukey_names])) { TRUE } else { bad_data <- x %>% - group_by(across(all_of(ukey_names))) %>% + group_by(pick(all_of(ukey_names))) %>% filter(dplyr::n() > 1) %>% ungroup() lines <- c( diff --git a/README.Rmd b/README.Rmd index 0e8756d3d..cb3fea3a9 100644 --- a/README.Rmd +++ b/README.Rmd @@ -95,31 +95,31 @@ df <- pub_covidcast( df ``` -Convert the data to an epi_df object and sort by geo_value and time_value. You +Convert the data to an `epi_df` object and sort by `geo_value` and `time_value`. You can work with an `epi_df` like you can with a `{tibble}` by using `{dplyr}` -verbs +verbs. 
```{r} edf <- df %>% as_epi_df(as_of = as.Date("2024-01-01")) %>% arrange_canonical() %>% group_by(geo_value) %>% - mutate(cases_daily = cases_cumulative - lag(cases_cumulative, default = 0)) + mutate(cases_daily = cases_cumulative - lag(cases_cumulative, default = 0)) %>% + ungroup() edf ``` -Compute the 7 day moving average of the confirmed daily cases for each geo_value +Compute the 7 day moving average of the confirmed daily cases for each `geo_value` ```{r} edf <- edf %>% - group_by(geo_value) %>% epi_slide_mean(cases_daily, .window_size = 7, na.rm = TRUE, .prefix = "smoothed_") edf ``` -Autoplot the confirmed daily cases for each geo_value +Autoplot the confirmed daily cases for each `geo_value` -```{r} +```{r, dev='svg'} edf %>% autoplot(smoothed_cases_daily) ``` diff --git a/README.md b/README.md index af8c24e90..5d82c3f5f 100644 --- a/README.md +++ b/README.md @@ -14,24 +14,24 @@ forecasting. `{epiprocess}` contains: - - `epi_df()` and `epi_archive()`, two data frame classes (that work - like a `{tibble}` with `{dplyr}` verbs) for working with - epidemiological time series data - - `epi_df` is for working with a snapshot of data at a single - point in time - - `epi_archive` is for working with histories of data that changes - over time - - one of the most common uses of `epi_archive` is for accurate - backtesting of forecasting models, see `vignette("backtesting", - package="epipredict")` - - signal processing tools building on these data structures such as - - `epi_slide()` for sliding window operations (aids with feature - creation) - - `epix_slide()` for sliding window operations on archives (aids - with backtesting) - - `growth_rate()` for computing growth rates - - `detect_outlr()` for outlier detection - - `epi_cor()` for computing correlations +- `epi_df()` and `epi_archive()`, two data frame classes (that work like + a `{tibble}` with `{dplyr}` verbs) for working with epidemiological + time series data + - `epi_df` is for working with a snapshot of data at a single point in + time + - `epi_archive` is for working with histories of data that changes + over time + - one of the most common uses of `epi_archive` is for accurate + backtesting of forecasting models, see + `vignette("backtesting", package="epipredict")` +- signal processing tools building on these data structures such as + - `epi_slide()` for sliding window operations (aids with feature + creation) + - `epix_slide()` for sliding window operations on archives (aids with + backtesting) + - `growth_rate()` for computing growth rates + - `detect_outlr()` for outlier detection + - `epi_cor()` for computing correlations If you are new to this set of tools, you may be interested learning through a book format: [Introduction to Epidemiological @@ -39,10 +39,10 @@ Forecasting](https://cmu-delphi.github.io/delphi-tooling-book/). You may also be interested in: - - `{epidatr}`, for accessing wide range of epidemiological data sets, - including COVID-19 data, flu data, and more. - - [rtestim](https://github.com/dajmcdon/rtestim), a package for - estimating the time-varying reproduction number of an epidemic. +- `{epidatr}`, for accessing wide range of epidemiological data sets, + including COVID-19 data, flu data, and more. +- [rtestim](https://github.com/dajmcdon/rtestim), a package for + estimating the time-varying reproduction number of an epidemic. This package is provided by the [Delphi group](https://delphi.cmu.edu/) at Carnegie Mellon University. 
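As a quick aside on the `epi_archive` bullets above: the main point of keeping full revision histories is that backtesting can be done with only the data that was actually available at each point in time. A minimal sketch (not part of this patch), assuming the `archive_cases_dv_subset` dataset and its `percent_cli` column that appear elsewhere in this diff, with arbitrary example dates:

```r
library(epiprocess)
library(dplyr)

# For each pretend "forecast date", look only at the snapshot of data that had
# been reported as of that date, then summarize it per location.
forecast_dates <- as.Date(c("2021-06-01", "2021-07-01", "2021-08-01"))
snapshots <- lapply(forecast_dates, function(forecast_date) {
  epix_as_of(archive_cases_dv_subset, forecast_date) %>%
    group_by(geo_value) %>%
    summarize(latest_percent_cli = percent_cli[which.max(time_value)])
})
```

`epix_slide()` wraps this pattern up (snapshot, compute, repeat over versions); see the `vignette("backtesting", package = "epipredict")` referenced above for the full workflow.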
@@ -101,16 +101,17 @@ df #> # ℹ 2,802 more rows ``` -Convert the data to an epi\_df object and sort by geo\_value and -time\_value. You can work with an `epi_df` like you can with a -`{tibble}` by using `{dplyr}` verbs +Convert the data to an `epi_df` object and sort by `geo_value` and +`time_value`. You can work with an `epi_df` like you can with a +`{tibble}` by using `{dplyr}` verbs. ``` r edf <- df %>% as_epi_df(as_of = as.Date("2024-01-01")) %>% arrange_canonical() %>% group_by(geo_value) %>% - mutate(cases_daily = cases_cumulative - lag(cases_cumulative, default = 0)) + mutate(cases_daily = cases_cumulative - lag(cases_cumulative, default = 0)) %>% + ungroup() edf #> An `epi_df` object, 2,808 x 4 with metadata: #> * geo_type = state @@ -118,9 +119,8 @@ edf #> * as_of = 2024-01-01 #> #> # A tibble: 2,808 × 4 -#> # Groups: geo_value [4] #> geo_value time_value cases_cumulative cases_daily -#> * +#> #> 1 ca 2020-03-01 19 19 #> 2 ca 2020-03-02 23 4 #> 3 ca 2020-03-03 29 6 @@ -131,11 +131,10 @@ edf ``` Compute the 7 day moving average of the confirmed daily cases for each -geo\_value +`geo_value` ``` r edf <- edf %>% - group_by(geo_value) %>% epi_slide_mean(cases_daily, .window_size = 7, na.rm = TRUE, .prefix = "smoothed_") edf #> An `epi_df` object, 2,808 x 5 with metadata: @@ -144,9 +143,8 @@ edf #> * as_of = 2024-01-01 #> #> # A tibble: 2,808 × 5 -#> # Groups: geo_value [4] #> geo_value time_value cases_cumulative cases_daily smoothed_cases_daily -#> * +#> #> 1 ca 2020-03-01 19 19 19 #> 2 ca 2020-03-02 23 4 11.5 #> 3 ca 2020-03-03 29 6 9.67 @@ -156,11 +154,11 @@ edf #> # ℹ 2,802 more rows ``` -Autoplot the confirmed daily cases for each geo\_value +Autoplot the confirmed daily cases for each `geo_value` ``` r edf %>% autoplot(smoothed_cases_daily) ``` - + diff --git a/_pkgdown.yml b/_pkgdown.yml index e8a2c8c3b..3742bd416 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -63,7 +63,7 @@ reference: - sum_groups_epi_df - epi_cor - detect_outlr - - growth_rate + - starts_with("growth_rate") - as_tibble.epi_df - as_tsibble.epi_df diff --git a/man/apply_compactify.Rd b/man/apply_compactify.Rd index 0e1f0b3c9..e96108789 100644 --- a/man/apply_compactify.Rd +++ b/man/apply_compactify.Rd @@ -4,7 +4,19 @@ \alias{apply_compactify} \title{Given a tibble as would be found in an epi_archive, remove duplicate entries.} \usage{ -apply_compactify(df, keys, tolerance = .Machine$double.eps^0.5) +apply_compactify(updates_df, ukey_names, abs_tol = 0) +} +\arguments{ +\item{updates_df}{DT of an \code{epi_archive} or something analogous (though +potentially unsorted) of another class} + +\item{ukey_names}{chr; the column names forming a unique key for the +\code{updates_df}; "version" must come last. For an \code{epi_archive}'s \code{DT}, this +would be \code{key(DT)}.} + +\item{abs_tol}{numeric, >=0; absolute tolerance to use on numeric measurement +columns when determining whether something can be compactified away; see +\code{\link{is_locf}}} } \description{ Works by shifting all rows except the version, then comparing values to see diff --git a/man/as_tibble.epi_df.Rd b/man/as_tibble.epi_df.Rd index 9d016cd60..56905c594 100644 --- a/man/as_tibble.epi_df.Rd +++ b/man/as_tibble.epi_df.Rd @@ -9,11 +9,11 @@ \arguments{ \item{x}{an \code{epi_df}} -\item{...}{Unused, for extensibility.} +\item{...}{if present, forwarded to \code{\link[tibble:as_tibble]{tibble::as_tibble}}} } \description{ -Converts an \code{epi_df} object into a tibble, dropping metadata and any -grouping. 
+Converts an \code{epi_df} object into a tibble, dropping metadata, any +grouping, and any unrelated classes and attributes. } \details{ Advanced: if you are working with a third-party package that uses diff --git a/man/difftime_approx_ceiling_time_delta.Rd b/man/difftime_approx_ceiling_time_delta.Rd new file mode 100644 index 000000000..060cadadd --- /dev/null +++ b/man/difftime_approx_ceiling_time_delta.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/time-utils.R +\name{difftime_approx_ceiling_time_delta} +\alias{difftime_approx_ceiling_time_delta} +\title{Closest time_delta that's approximately greater than or equal to given difftime} +\usage{ +difftime_approx_ceiling_time_delta(difftime, time_type) +} +\arguments{ +\item{difftime}{a difftime object} + +\item{time_type}{as in \code{\link{validate_slide_window_arg}}} +} +\value{ +An object representing an integerish number (or vector of numbers) of +time steps between consecutive time_values of type \code{time_type}. +} +\description{ +\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} +} +\keyword{internal} diff --git a/man/difftime_summary.Rd b/man/difftime_summary.Rd deleted file mode 100644 index ef153f3d1..000000000 --- a/man/difftime_summary.Rd +++ /dev/null @@ -1,12 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/revision_analysis.R -\name{difftime_summary} -\alias{difftime_summary} -\title{summary doesn't work on difftimes} -\usage{ -difftime_summary(diff_time_val) -} -\description{ -summary doesn't work on difftimes -} -\keyword{internal} diff --git a/man/epi_archive.Rd b/man/epi_archive.Rd index 4b459d5e4..f98666e03 100644 --- a/man/epi_archive.Rd +++ b/man/epi_archive.Rd @@ -12,26 +12,19 @@ new_epi_archive( geo_type, time_type, other_keys, - compactify, - clobberable_versions_start, - versions_end, - compactify_tol = .Machine$double.eps^0.5 -) - -validate_epi_archive( - x, - other_keys, - compactify, clobberable_versions_start, versions_end ) +validate_epi_archive(x) + as_epi_archive( x, geo_type = deprecated(), time_type = deprecated(), other_keys = character(), - compactify = NULL, + compactify = TRUE, + compactify_abs_tol = 0, clobberable_versions_start = NA, .versions_end = max_version_with_row_in(x), ..., @@ -54,10 +47,6 @@ that should be considered key variables (in the language of \code{data.table}) apart from "geo_value", "time_value", and "version". Typical examples are "age" or more granular geographies.} -\item{compactify}{Optional; Boolean. \code{TRUE} will remove some -redundant rows, \code{FALSE} will not, and missing or \code{NULL} will remove -redundant rows, but issue a warning. See more information at \code{compactify}.} - \item{clobberable_versions_start}{Optional; \code{length}-1; either a value of the same \code{class} and \code{typeof} as \code{x$version}, or an \code{NA} of any \code{class} and \code{typeof}: specifically, either (a) the earliest version that could be @@ -84,8 +73,16 @@ value of \code{clobberable_versions_start} does not fully trust these empty updates, and assumes that any version \verb{>= max(x$version)} could be clobbered.) If \code{nrow(x) == 0}, then this argument is mandatory.} -\item{compactify_tol}{double. the tolerance used to detect approximate -equality for compactification} +\item{compactify}{Optional; \code{TRUE}, \code{FALSE}, or \code{"message"}. 
\code{TRUE} will +remove some redundant rows, \code{FALSE} will not. \code{"message"} is like \code{TRUE} +but will emit a message if anything was changed. Default is \code{TRUE}. See +more information below under "Compactification:".} + +\item{compactify_abs_tol}{Optional; double. A tolerance level used to detect +approximate equality for compactification. The default is 0, which +corresponds to exact equality. Consider using this if your value columns +undergo tiny nonmeaningful revisions and the archive object with the +default setting is too large.} \item{.versions_end}{location based versions_end, used to avoid prefix \code{version = issue} from being assigned to \code{versions_end} instead of being @@ -107,8 +104,11 @@ possibly other key columns) along with measurement values. In brief, an column tracks the time at which the data was available. This allows for version-aware forecasting. -\code{new_epi_archive} is the constructor for \code{epi_archive} objects that assumes -all arguments have been validated. Most users should use \code{as_epi_archive}. +\code{new_epi_archive} is the low-level constructor for \code{epi_archive} objects that +only performs some fast, basic checks on the inputs. \code{validate_epi_archive} +can perform more costly validation checks on its output. But most users +should use \code{as_epi_archive}, which performs all necessary checks and has some +additional features. } \details{ An \code{epi_archive} contains a \code{data.table} object \code{DT} (from the diff --git a/man/epi_slide_opt.Rd b/man/epi_slide_opt.Rd index 68244410b..4b75e9ffb 100644 --- a/man/epi_slide_opt.Rd +++ b/man/epi_slide_opt.Rd @@ -139,9 +139,9 @@ or \link[slider:summary-slide]{slider::summary-slide} function over variables in These functions tend to be much faster than \code{epi_slide()}. See \code{vignette("epi_df")} for more examples. -\code{epi_slide_mean} is a wrapper around \code{epi_slide_opt} with \code{.f = datatable::frollmean}. +\code{epi_slide_mean} is a wrapper around \code{epi_slide_opt} with \code{.f = data.table::frollmean}. -\code{epi_slide_sum} is a wrapper around \code{epi_slide_opt} with \code{.f = datatable::frollsum}. +\code{epi_slide_sum} is a wrapper around \code{epi_slide_opt} with \code{.f = data.table::frollsum}. } \section{Prefix and suffix shorthand}{ diff --git a/man/epidatasets_reexports.Rd b/man/epidatasets_reexports.Rd deleted file mode 100644 index 3dc809e45..000000000 --- a/man/epidatasets_reexports.Rd +++ /dev/null @@ -1,40 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/reexports.R -\docType{data} -\name{cases_deaths_subset} -\alias{cases_deaths_subset} -\alias{covid_incidence_county_subset} -\alias{covid_incidence_outliers} -\alias{archive_cases_dv_subset} -\alias{covid_case_death_rates_extended} -\title{Selected example data sets from \code{epidatasets}} -\format{ -An object of class \code{epi_df} (inherits from \code{tbl_df}, \code{tbl}, \code{data.frame}) with 4026 rows and 6 columns. - -An object of class \code{epi_df} (inherits from \code{tbl_df}, \code{tbl}, \code{data.frame}) with 16212 rows and 5 columns. - -An object of class \code{epi_df} (inherits from \code{tbl_df}, \code{tbl}, \code{data.frame}) with 730 rows and 3 columns. - -An object of class \code{epi_archive} of length 6. - -An object of class \code{epi_df} (inherits from \code{tbl_df}, \code{tbl}, \code{data.frame}) with 37576 rows and 4 columns. 
-} -\usage{ -cases_deaths_subset - -covid_incidence_county_subset - -covid_incidence_outliers - -archive_cases_dv_subset - -covid_case_death_rates_extended -} -\description{ -Data sets re-exported from \code{epidatasets}; please see -documentation for each of these objects in \code{epidatasets}. - -A brief description of the format of each of the objects above are described -in matching order below. -} -\keyword{internal} diff --git a/man/epiprocess-package.Rd b/man/epiprocess-package.Rd index 774d5f8ac..fe79c01e8 100644 --- a/man/epiprocess-package.Rd +++ b/man/epiprocess-package.Rd @@ -20,6 +20,7 @@ Useful links: Authors: \itemize{ + \item Nat DeFries \item Daniel McDonald \item Evan Ray \item Dmitry Shemetov @@ -30,7 +31,6 @@ Other contributors: \itemize{ \item Jacob Bien [contributor] \item Rafael Catoia [contributor] - \item Nat DeFries [contributor] \item Rachel Lobay [contributor] \item Ken Mawer [contributor] \item Chloe You [contributor] diff --git a/man/epix_as_of.Rd b/man/epix_as_of.Rd index c3682489a..be5c977aa 100644 --- a/man/epix_as_of.Rd +++ b/man/epix_as_of.Rd @@ -71,8 +71,7 @@ archive_cases_dv_subset2 <- as_epi_archive( # (a.k.a. "hotfixed", "clobbered", etc.): clobberable_versions_start = max(archive_cases_dv_subset$DT$version), # Suppose today is the following day, and there are no updates out yet: - versions_end = max(archive_cases_dv_subset$DT$version) + 1L, - compactify = TRUE + versions_end = max(archive_cases_dv_subset$DT$version) + 1L ) epix_as_of(archive_cases_dv_subset2, max(archive_cases_dv_subset$DT$version)) diff --git a/man/epix_merge.Rd b/man/epix_merge.Rd index 3ffebc990..ca6b0715f 100644 --- a/man/epix_merge.Rd +++ b/man/epix_merge.Rd @@ -8,7 +8,8 @@ epix_merge( x, y, sync = c("forbid", "na", "locf", "truncate"), - compactify = TRUE + compactify = TRUE, + compactify_abs_tol = 0 ) } \arguments{ @@ -33,8 +34,10 @@ and use \code{min(x$versions_end, y$versions_end)} as the result's \code{versions_end}. }} -\item{compactify}{Optional; \code{TRUE} (default), \code{FALSE}, or \code{NULL}; should the +\item{compactify}{Optional; \code{TRUE} (default), \code{FALSE}, or \code{"message"}; should the result be compactified? 
See \code{as_epi_archive()} for details.} + +\item{compactify_abs_tol}{As in \code{\link[=as_epi_archive]{as_epi_archive()}}.} } \value{ the resulting \code{epi_archive}
diff --git a/man/f_no_na.Rd b/man/f_no_na.Rd index 9a832d729..1e3acb6f7 100644 --- a/man/f_no_na.Rd +++ b/man/f_no_na.Rd @@ -2,13 +2,13 @@ % Please edit documentation in R/revision_analysis.R \name{f_no_na} \alias{f_no_na} -\title{use when the default behavior returns a warning on empty lists, which we do +\title{use when the default behavior returns a warning on empty vectors, which we do not want, and there is no super clean way of preventing this} \usage{ f_no_na(f, x) } \description{ -use when the default behavior returns a warning on empty lists, which we do +use when the default behavior returns a warning on empty vectors, which we do not want, and there is no super clean way of preventing this } \keyword{internal}
diff --git a/man/figures/README-unnamed-chunk-6-1.png b/man/figures/README-unnamed-chunk-6-1.png deleted file mode 100644 index b435c6514..000000000 Binary files a/man/figures/README-unnamed-chunk-6-1.png and /dev/null differ
diff --git a/man/figures/README-unnamed-chunk-6-1.svg b/man/figures/README-unnamed-chunk-6-1.svg deleted file mode 100644 index adde41573..000000000 --- a/man/figures/README-unnamed-chunk-6-1.svg +++ /dev/null @@ -1,383 +0,0 @@ [383 lines of deleted SVG figure markup omitted]
diff --git a/man/figures/README-unnamed-chunk-7-1.png b/man/figures/README-unnamed-chunk-7-1.png deleted file mode 100644 index 3c40f30a4..000000000 Binary files a/man/figures/README-unnamed-chunk-7-1.png and /dev/null differ
diff --git a/man/figures/README-unnamed-chunk-7-1.svg b/man/figures/README-unnamed-chunk-7-1.svg index 30058a576..e4ac3d38e 100644 --- a/man/figures/README-unnamed-chunk-7-1.svg +++ b/man/figures/README-unnamed-chunk-7-1.svg @@ -1,399 +1,388 @@ [SVG figure markup changes omitted]
diff --git a/man/figures/README-unnamed-chunk-8-1.svg b/man/figures/README-unnamed-chunk-8-1.svg deleted file mode 100644 index 5ec5ba01b..000000000 --- a/man/figures/README-unnamed-chunk-8-1.svg +++ /dev/null @@ -1,365 +0,0 @@ [365 lines of deleted SVG figure markup omitted]
diff --git a/man/format_chr_deparse.Rd b/man/format_chr_deparse.Rd new file mode 100644 index 000000000..a283c3288 --- /dev/null +++ b/man/format_chr_deparse.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{format_chr_deparse} +\alias{format_chr_deparse} +\title{Format a character vector as a string via deparsing it} +\usage{ +format_chr_deparse(x) +} +\arguments{ +\item{x}{\code{chr}; e.g., output of \code{class(object)} for some \code{object}} +} +\value{ +string +} +\description{ +Format a character vector as a string via deparsing it +} +\keyword{internal}
diff --git a/man/format_class_vec.Rd b/man/format_class_vec.Rd deleted file mode 100644 index 2c7ae4b76..000000000 --- a/man/format_class_vec.Rd +++ /dev/null @@ -1,18 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils.R -\name{format_class_vec} -\alias{format_class_vec} -\title{Format a class vector as a string via deparsing it} -\usage{ -format_class_vec(class_vec) -} -\arguments{ -\item{class_vec}{\code{chr}; output of \code{class(object)} for some \code{object}} -} -\value{ -string -} -\description{ -Format a class vector as a string via deparsing it -} -\keyword{internal}
diff --git a/man/format_time_delta.Rd b/man/format_time_delta.Rd new file mode 100644 index 000000000..3658ff272 --- /dev/null +++ b/man/format_time_delta.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/time-utils.R +\name{format_time_delta} +\alias{format_time_delta} +\title{Format a length-1 time delta to a character to assist messaging} +\usage{ +format_time_delta(x, time_type) +} +\description{ +This is meant to address the following: +\itemize{ +\item \code{glue::glue("{as.difftime(1, units = 'days')}")} is "1" +\item \code{glue::glue("{format(as.difftime(1, units = 'days'))}")} is "1 days" +\item time deltas for yearmonths and integers don't have units attached at all +} +} +\keyword{internal}
diff --git a/man/growth_rate.Rd b/man/growth_rate.Rd index 0e22508c9..1529a836e
100644 --- a/man/growth_rate.Rd +++ b/man/growth_rate.Rd @@ -5,26 +5,25 @@ \title{Estimate growth rate} \usage{ growth_rate( - x = seq_along(y), y, + x = seq_along(y), x0 = x, method = c("rel_change", "linear_reg", "smooth_spline", "trend_filter"), h = 7, log_scale = FALSE, - dup_rm = FALSE, na_rm = FALSE, - ... + params = growth_rate_params() ) } \arguments{ +\item{y}{Signal values.} + \item{x}{Design points corresponding to the signal values \code{y}. Default is \code{seq_along(y)} (that is, equally-spaced points from 1 to the length of \code{y}).} -\item{y}{Signal values.} - \item{x0}{Points at which we should estimate the growth rate. Must be a -subset of \code{x} (no extrapolation allowed). Default is \code{x}.} +contained in the range of \code{x} (no extrapolation allowed). Default is \code{x}.} \item{method}{Either "rel_change", "linear_reg", "smooth_spline", or "trend_filter", indicating the method to use for the growth rate @@ -39,16 +38,11 @@ the entire sequence. See details for more explanation.} \item{log_scale}{Should growth rates be estimated using the parametrization on the log scale? See details for an explanation. Default is \code{FALSE}.} -\item{dup_rm}{Should we check and remove duplicates in \code{x} (and corresponding -elements of \code{y}) before the computation? Some methods might handle -duplicate \code{x} values gracefully, whereas others might fail (either quietly -or loudly). Default is \code{FALSE}.} - \item{na_rm}{Should missing values be removed before the computation? Default is \code{FALSE}.} -\item{...}{Additional arguments to pass to the method used to estimate the -derivative.} +\item{params}{Additional arguments to pass to the method used to estimate the +derivative. This should be created with \code{growth_rate_params()}.} } \value{ Vector of growth rate estimates at the specified points \code{x0}. @@ -77,12 +71,14 @@ using a first-difference approximation to the derivative. sliding window centered at the reference point \code{x0}, divided by the fitted value from this linear regression at \code{x0}. \item "smooth_spline": uses the estimated derivative at \code{x0} from a smoothing -spline fit to \code{x} and \code{y}, via \code{stats::smooth.spline()}, divided by the +spline fit to \code{x} and \code{y}, via \code{\link[stats:smooth.spline]{stats::smooth.spline()}}, divided by the fitted value of the spline at \code{x0}. \item "trend_filter": uses the estimated derivative at \code{x0} from polynomial trend filtering (a discrete spline) fit to \code{x} and \code{y}, via -\code{genlasso::trendfilter()}, divided by the fitted value of the discrete -spline at \code{x0}. +\code{\link[trendfilter:trendfilter]{trendfilter::trendfilter()}}, divided by the fitted value of the discrete +spline at \code{x0}. This method requires the +\href{https://github.com/glmgen/trendfilter}{\code{{trendfilter}} package} +to be installed. } \subsection{Log Scale}{ @@ -110,26 +106,30 @@ behavior of \code{epi_slide()} with \code{before = h - 1} and \code{after = h}). \subsection{Additional Arguments}{ For the global methods, "smooth_spline" and "trend_filter", additional -arguments can be specified via \code{...} for the underlying estimation -function. For the smoothing spline case, these additional arguments are -passed directly to \code{stats::smooth.spline()} (and the defaults are exactly -as in this function). 
The trend filtering case works a bit differently: -here, a custom set of arguments is allowed (which are distributed -internally to \code{genlasso::trendfilter()} and \code{genlasso::cv.trendfilter()}): +arguments can be specified via \code{params} for the underlying estimation +function. These additional arguments are +passed to \code{\link[stats:smooth.spline]{stats::smooth.spline()}}, \code{\link[trendfilter:trendfilter]{trendfilter::trendfilter()}}, or +\code{\link[trendfilter:cv_trendfilter]{trendfilter::cv_trendfilter()}}. The defaults are exactly +as specified in those functions, except when those defaults conflict +among these functions. These cases are as follows: \itemize{ -\item \code{ord}: order of piecewise polynomial for the trend filtering fit. Default -is 3. -\item \code{maxsteps}: maximum number of steps to take in the solution path before -terminating. Default is 1000. -\item \code{cv}: should cross-validation be used to choose an effective degrees of -freedom for the fit? Default is \code{TRUE}. -\item \code{k}: number of folds if cross-validation is to be used. Default is 3. -\item \code{df}: desired effective degrees of freedom for the trend filtering fit. If -\code{cv = FALSE}, then \code{df} must be a positive integer; if \code{cv = TRUE}, then -\code{df} must be one of "min" or "1se" indicating the selection rule to use +\item \code{df}: desired effective degrees of freedom. For "smooth_spline", this must be numeric (or \code{NULL}) and will +be passed along to the underlying function. For "trend_filter", if +\code{cv = FALSE}, then \code{df} must be a positive number (integer is most sensible); +if \code{cv = TRUE}, then \code{df} must be one of "min" or "1se" indicating the +selection rule to use based on the cross-validation error curve: minimum or 1-standard-error -rule, respectively. Default is "min" (going along with the default \code{cv = TRUE}). Note that if \code{cv = FALSE}, then we require \code{df} to be set by the -user. +rule, respectively. The default is "min" (going along with the default +\code{cv = TRUE}). +\item \code{lambda}: For "smooth_spline", this should be a scalar value or \code{NULL}. +For "trend_filter", this is allowed to also be a vector, as long as either +\code{cv = TRUE} or \code{df} is specified. +\item \code{cv}: should cross-validation be used to choose an effective degrees of +freedom for the fit? The default is \code{FALSE} to match \code{\link[stats:smooth.spline]{stats::smooth.spline()}}. +In that case, as in that function, GCV is used instead. +For "trend_filter", this will be coerced to \code{TRUE} if neither +\code{df} nor \code{lambda} are specified (the default). +Note that passing both \code{df} and a scalar \code{lambda} will always be an error. 
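As a hedged illustration of the `df`/`lambda`/`cv` interplay described above (not part of the .Rd examples; the parameter values are arbitrary and the `{trendfilter}` package must be installed for this method):

```r
library(epiprocess)
library(dplyr)

cases_deaths_subset %>%
  group_by(geo_value) %>%
  mutate(gr_tf = growth_rate(
    x = time_value, y = cases, method = "trend_filter", na_rm = TRUE,
    # Neither `df` nor `lambda` is given, so cross-validation is used to pick
    # the penalty; k = 2L fits piecewise-quadratic segments and nfolds = 5L
    # controls the cross-validation split.
    params = growth_rate_params(k = 2L, nfolds = 5L)
  ))
```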
} } } @@ -139,8 +139,12 @@ cases_deaths_subset \%>\% group_by(geo_value) \%>\% mutate(cases_gr = growth_rate(x = time_value, y = cases)) -# Log scale, degree 4 polynomial and 6-fold cross validation +# Degree 3 polynomial and 5-fold cross validation on the log scale +# some locations report 0 cases, so we replace these with 1 cases_deaths_subset \%>\% group_by(geo_value) \%>\% - mutate(gr_poly = growth_rate(x = time_value, y = cases, log_scale = TRUE, ord = 4, k = 6)) + mutate(gr_poly = growth_rate( + x = time_value, y = pmax(cases, 1), method = "trend_filter", + log_scale = TRUE, na_rm = TRUE + )) } diff --git a/man/growth_rate_params.Rd b/man/growth_rate_params.Rd new file mode 100644 index 000000000..b0bb00c80 --- /dev/null +++ b/man/growth_rate_params.Rd @@ -0,0 +1,120 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/growth_rate.R +\name{growth_rate_params} +\alias{growth_rate_params} +\title{Optional parameters for growth rate methods} +\usage{ +growth_rate_params( + df = NULL, + lambda = NULL, + cv = FALSE, + spar = NULL, + all.knots = FALSE, + df.offset = 0, + penalty = 1, + k = 3L, + family = c("gaussian", "logistic", "poisson"), + nlambda = 50L, + lambda_max = NULL, + lambda_min = NULL, + lambda_min_ratio = 1e-05, + error_measure = c("deviance", "mse", "mae"), + nfolds = 3L +) +} +\arguments{ +\item{df}{Numeric or NULL for "smooth_spline". May also be one of "min" or +"max" in the case of "trend_filter". The desired equivalent number of +degrees of freedom of the fit. Lower values give smoother estimates.} + +\item{lambda}{The desired smoothing parameter. For "smooth_spline", this +can be specified instead of \code{spar}. For "trend_filter", this sequence +determines the balance between data fidelity and smoothness of the +estimated curve; larger \code{lambda} results in a smoother estimate. The +default, \code{NULL} results in an automatic computation based on \code{nlambda}, +the largest value of \code{lambda} that would result in a maximally smooth +estimate, and \code{lambda_min_ratio}. Supplying a value of \code{lambda} overrides +this behaviour.} + +\item{cv}{For "smooth_spline", ordinary leave-one-out (\code{TRUE}) or ‘generalized’ +cross-validation (GCV) when \code{FALSE}; is used for smoothing parameter computation +only when both \code{spar} and \code{df} are not specified. For "trend_filter", +\code{cv} determines whether or not cross-validation is used to choose the +tuning parameter. If \code{FALSE}, then the user must specify either \code{lambda} +or \code{df}.} + +\item{spar}{smoothing parameter, typically (but not necessarily) in + \eqn{(0,1]}. When \code{spar} is specified, the coefficient + \eqn{\lambda} of the integral of the squared second derivative in the + fit (penalized log likelihood) criterion is a monotone function of + \code{spar}, see the details below. Alternatively \code{lambda} may + be specified instead of the \emph{scale free} \code{spar}=\eqn{s}.} + +\item{all.knots}{if \code{TRUE}, all distinct points in \code{x} are used + as knots. If \code{FALSE} (default), a subset of \code{x[]} is used, + specifically \code{x[j]} where the \code{nknots} indices are evenly + spaced in \code{1:n}, see also the next argument \code{nknots}. 
+ + Alternatively, a strictly increasing \code{\link{numeric}} vector + specifying \dQuote{all the knots} to be used; must be rescaled + to \eqn{[0, 1]} already such that it corresponds to the + \code{ans $ fit$knots} sequence returned, not repeating the boundary + knots.} + +\item{df.offset}{allows the degrees of freedom to be increased by + \code{df.offset} in the GCV criterion.} + +\item{penalty}{the coefficient of the penalty for degrees of freedom + in the GCV criterion.} + +\item{k}{Integer. Degree of the piecewise polynomial curve to be +estimated. For example, \code{k = 0} corresponds to a piecewise constant +curve.} + +\item{family}{Character or function. Specifies the loss function +to use. Valid options are: +\itemize{ +\item \code{"gaussian"} - least squares loss (the default), +\item \code{"binomial"} - logistic loss (classification), +\item \code{"poisson"} - Poisson loss for count data +} + +For any other type, a valid \code{\link[stats:family]{stats::family()}} object may be passed. Note +that these will generally be much slower to estimate than the built-in +options passed as strings. So for example, \code{family = "gaussian"} and +\code{family = gaussian()} will produce the same results, but the first +will be much faster.character.} + +\item{nlambda}{Integer. Number of lambda values to use in the sequence.} + +\item{lambda_max}{Optional value for the largest \code{lambda} to use.} + +\item{lambda_min}{Optional value for the smallest \code{lambda} to use (> 0).} + +\item{lambda_min_ratio}{If neither \code{lambda} nor \code{lambda_min} is specified, +\code{lambda_min = lambda_max * lambda_min_ratio}. +A very small value will lead to the solution \code{theta = y} (for the Gaussian +loss). This argument has no effect if there is a user-defined \code{lambda} +sequence.} + +\item{error_measure}{Metric used to calculate cross validation scores. May +be \code{mse}, \code{mae}, or \code{deviance}.} + +\item{nfolds}{Integer. The number of folds to use. For leave-vth-out cross +validation, every vth \code{y} value and its corresponding position (and weight) +are placed into the same fold. The first and last observations are not +assigned to any folds. This value must be at least 2. As an example, with +15 data points and \code{nfolds = 4}, the points are assigned to folds in the +following way: +\deqn{ + 0 \; 1 \; 2 \; 3 \; 4 \; 1 \; 2 \; 3 \; 4 \; 1 \; 2 \; 3 \; 4 \; 1 \; 0 + }{0 1 2 3 4 1 2 3 4 1 2 3 4 1 0} where 0 indicates no assignment. +Therefore, the folds are not random and running \code{cv_trendfilter()} twice +will give the same result.} +} +\value{ +A list of parameter configurations. +} +\description{ +Construct an object containing non-standard arguments for \code{\link[=growth_rate]{growth_rate()}}. +} diff --git a/man/guess_period.Rd b/man/guess_period.Rd index 5f17cf4ef..9cbfebc6c 100644 --- a/man/guess_period.Rd +++ b/man/guess_period.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils.R +% Please edit documentation in R/time-utils.R \name{guess_period} \alias{guess_period} \title{Use max valid period as guess for \code{period} of \code{time_values}} diff --git a/man/is_locf.Rd b/man/is_locf.Rd index 8efeecfd4..f8f8eefcb 100644 --- a/man/is_locf.Rd +++ b/man/is_locf.Rd @@ -4,12 +4,29 @@ \alias{is_locf} \title{Checks to see if a value in a vector is LOCF} \usage{ -is_locf(vec, tolerance) +is_locf(vec, abs_tol, is_key) } \description{ -LOCF meaning last observation carried forward. 
lags the vector by 1, then -compares with itself. For doubles it uses float comparison via -\code{\link[dplyr:near]{dplyr::near}}, otherwise it uses equality. \code{NA}'s and \code{NaN}'s are -considered equal to themselves and each other. +LOCF meaning last observation carried forward (to later +versions). Lags the vector by 1, then compares with itself. If \code{is_key} is +\code{TRUE}, only values that are exactly the same between the lagged and +original are considered LOCF. If \code{is_key} is \code{FALSE} and \code{vec} is a vector +of numbers (\code{\link[base:numeric]{base::is.numeric}}), then approximate equality will be used, +checking whether the absolute difference between each pair of entries is +\verb{<= abs_tol}; if \code{vec} is something else, then exact equality is used +instead. +} +\details{ +We include epikey-time columns in LOCF comparisons as part of an optimization +to avoid slower grouped operations while still ensuring that the first +observation for each time series will not be marked as LOCF. We test these +key columns for exact equality to prevent chopping off consecutive +time_values during flat periods when \code{abs_tol} is high. + +We use exact equality for non-\code{is.numeric} double/integer columns such as +dates, datetimes, difftimes, \code{tsibble::yearmonth}s, etc., as these may be +used as part of re-indexing or grouping procedures, and we don't want to +change the number of groups for those operations when we remove LOCF data +during compactification. } \keyword{internal} diff --git a/man/key_colnames.Rd b/man/key_colnames.Rd index f5e13837c..3a1865bde 100644 --- a/man/key_colnames.Rd +++ b/man/key_colnames.Rd @@ -2,35 +2,62 @@ % Please edit documentation in R/key_colnames.R \name{key_colnames} \alias{key_colnames} -\alias{key_colnames.default} \alias{key_colnames.data.frame} \alias{key_colnames.epi_df} +\alias{key_colnames.tbl_ts} \alias{key_colnames.epi_archive} -\title{Grab any keys associated to an epi_df} +\title{Get names of columns that form a (unique) key associated with an object} \usage{ -key_colnames(x, ...) +key_colnames(x, ..., exclude = character()) -\method{key_colnames}{default}(x, ...) +\method{key_colnames}{data.frame}(x, ..., geo_keys, other_keys, time_keys, exclude = character()) -\method{key_colnames}{data.frame}(x, other_keys = character(0L), exclude = character(0L), ...) +\method{key_colnames}{epi_df}( + x, + ..., + geo_keys = "geo_value", + other_keys = attr(x, "metadata")$other_keys, + time_keys = "time_value", + exclude = character() +) -\method{key_colnames}{epi_df}(x, exclude = character(0L), ...) +\method{key_colnames}{tbl_ts}(x, ..., exclude = character()) -\method{key_colnames}{epi_archive}(x, exclude = character(0L), ...) +\method{key_colnames}{epi_archive}(x, ..., exclude = character()) } \arguments{ -\item{x}{a data.frame, tibble, or epi_df} +\item{x}{an object, often a data frame or something similar. 
\code{{epiprocess}} +includes implementations for \code{\link{epi_df}}s, \code{\link{epi_archive}}s, +\code{\link[tsibble:tsibble]{tsibble::tsibble}}s, and other data frames (including +\code{\link[tibble:tibble]{tibble::tibble}}s); other packages, like \code{{epipredict}}, can add more.} \item{...}{additional arguments passed on to methods} -\item{other_keys}{an optional character vector of other keys to include} +\item{exclude}{an optional character vector of key column names to exclude +from the result} -\item{exclude}{an optional character vector of keys to exclude} +\item{geo_keys, other_keys, time_keys}{character vectors, sometimes optional; +which variables (if any) should be considered as part of a unique +key/identifier for data in \code{x}, dealing respectively with the associated +geographical region, demographic/strain/other information needed in +addition to the geographical region to identify individual time series in +\code{x}, and time interval during which associated events occurred. + +Mandatory if \code{x} is a regular \code{data.frame} or \code{tibble}. Optional if \code{x} is +an \code{epi_df}; the defaults are \code{"geo_value"}, the \code{epi_df}'s \code{other_keys} +metadata, and \code{"time_value"}, respectively; if you provide these manually, +they must match the defaults. (This behavior is to enable consistent and +sane results when you can't guarantee whether \code{x} is an \code{epi_df} or just a +\code{tibble}/\code{data.frame}. You don't need to use it if you know that \code{x} is +definitely an \code{epi_df}.) Not accepted when \code{x} is a \code{tsibble} or an +\code{epi_archive}.} } \value{ -If an \code{epi_df}, this returns all "keys". Otherwise \code{NULL}. +character vector } \description{ -Grab any keys associated to an epi_df +This is entirely based on metadata and arguments passed; there are no +explicit checks that the key actually is unique in any associated data +structures. } \keyword{internal} diff --git a/man/lag_within_x_latest.Rd b/man/lag_within_x_latest.Rd new file mode 100644 index 000000000..9c90fd8c3 --- /dev/null +++ b/man/lag_within_x_latest.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/revision_analysis.R +\name{lag_within_x_latest} +\alias{lag_within_x_latest} +\title{pull the value from lags when values starts indefinitely being within prop of its latest value.} +\usage{ +lag_within_x_latest(lags, values, prop = 0.2) +} +\arguments{ +\item{lags}{vector of lags; should be sorted} + +\item{values}{this should be a vector (e.g., a column) with length matching that of \code{lags}} + +\item{prop}{optional length-1 double; proportion} +} +\description{ +pull the value from lags when values starts indefinitely being within prop of its latest value. 
+} +\keyword{internal} diff --git a/man/n_steps_to_time_delta.Rd b/man/n_steps_to_time_delta.Rd new file mode 100644 index 000000000..6a4763464 --- /dev/null +++ b/man/n_steps_to_time_delta.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/time-utils.R +\name{n_steps_to_time_delta} +\alias{n_steps_to_time_delta} +\title{Convert from integerish/infinite/mix to time_delta} +\usage{ +n_steps_to_time_delta(n_steps, time_type, format = c("friendly", "fast")) +} +\arguments{ +\item{n_steps}{integerish vector that can mix in infinite values} + +\item{time_type}{as in \code{\link{validate_slide_window_arg}}} + +\item{format}{optional; \code{"friendly"} to output a more descriptive/friendly +class like \code{"difftime"} when possible; \code{"fast"} to output a class that's +generally faster to work with when possible, like a vanilla \code{"numeric"}. +Default is \code{"friendly"}.} +} +\description{ +Convert from integerish/infinite/mix to time_delta +} +\keyword{internal} diff --git a/man/reconstruct_light_edf.Rd b/man/reconstruct_light_edf.Rd new file mode 100644 index 000000000..dacaa6ab7 --- /dev/null +++ b/man/reconstruct_light_edf.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/methods-epi_df.R +\name{reconstruct_light_edf} +\alias{reconstruct_light_edf} +\title{Like \code{dplyr_reconstruct.epi_df} but not recomputing any grouping} +\usage{ +reconstruct_light_edf(data, template) +} +\description{ +In the move to our current not-quite-proper/effective "implementation" of +\code{\link[dplyr:dplyr_extending]{dplyr::dplyr_extending}} for \code{epi_df}s, we moved a lot of checks in +\code{dplyr_reconstruct} and used it instead of \code{reclass()} in various +operations to prevent operations from outputting invalid metadata/classes, +instead of more careful tailored and relevant checks. However, this actually +introduced extra overhead due to \code{dplyr_reconstruct.epi_df()} passing off to +\code{dplyr_reconstruct.grouped_df()} when grouped, which assumes that it will +need to / should for safety recompute the groups, even when it'd be safe for +it not to do so. In many operations, we're using \code{NextMethod()} to dispatch +to \code{grouped_df} behavior if needed, and it should output something with valid +groupings. +} +\details{ +This function serves the original purpose of performing \code{epi_df}-centric +checks rather than just throwing on potentially-incorrect metadata like +\code{reclass()}, but without unnecessary \code{dplyr_reconstruct()} delegation. 
+} +\keyword{internal} diff --git a/man/removed_by_compactify.Rd b/man/removed_by_compactify.Rd index 2f1298885..fa65fa792 100644 --- a/man/removed_by_compactify.Rd +++ b/man/removed_by_compactify.Rd @@ -4,7 +4,7 @@ \alias{removed_by_compactify} \title{get the entries that \code{compactify} would remove} \usage{ -removed_by_compactify(df, keys, tolerance) +removed_by_compactify(updates_df, ukey_names, abs_tol) } \description{ get the entries that \code{compactify} would remove diff --git a/man/revision_summary.Rd b/man/revision_summary.Rd index 39a72b9a3..54abf098c 100644 --- a/man/revision_summary.Rd +++ b/man/revision_summary.Rd @@ -9,40 +9,44 @@ revision_summary( ..., drop_nas = TRUE, print_inform = TRUE, - min_waiting_period = as.difftime(60, units = "days"), + min_waiting_period = as.difftime(60, units = "days") \%>\% + difftime_approx_ceiling_time_delta(epi_arch$time_type), within_latest = 0.2, - quick_revision = as.difftime(3, units = "days"), + quick_revision = as.difftime(3, units = "days") \%>\% + difftime_approx_ceiling_time_delta(epi_arch$time_type), few_revisions = 3, abs_spread_threshold = NULL, rel_spread_threshold = 0.1, - compactify_tol = .Machine$double.eps^0.5, - should_compactify = TRUE + compactify = TRUE, + compactify_abs_tol = 0 ) } \arguments{ \item{epi_arch}{an epi_archive to be analyzed} \item{...}{<\code{\link[=dplyr_tidy_select]{tidyselect}}>, used to choose the column to -summarize. If empty, it chooses the first. Currently only implemented for -one column at a time.} +summarize. If empty and there is only one value/measurement column (i.e., +not in \code{\link{key_colnames}}) in the archive, it will automatically select it. +If supplied, \code{...} must select exactly one column.} \item{drop_nas}{bool, drop any \code{NA} values from the archive? After dropping -\code{NA}'s compactify is run again to make sure there are no duplicate values -from occasions when the signal is revised to \code{NA}, and then back to its -immediately-preceding value.} +\code{NA}'s compactify is run again if \code{compactify} is \code{TRUE} to make +sure there are no duplicate values from occasions when the signal is +revised to \code{NA}, and then back to its immediately-preceding value.} \item{print_inform}{bool, determines whether to print summary information, or only return the full summary tibble} \item{min_waiting_period}{\code{difftime}, integer or \code{NULL}. Sets a cutoff: any -time_values not earlier than \code{min_waiting_period} before \code{versions_end} are -removed. \code{min_waiting_period} should characterize the typical time during -which revisions occur. The default of 60 days corresponds to a typical -final value for case counts as reported in the context of insurance. To -avoid this filtering, either set to \code{NULL} or 0.} +time_values that have not had at least \code{min_waiting_period} to stabilize as +of the \code{versions_end} are removed. \code{min_waiting_period} should characterize +the typical time during which most significant revisions occur. The default +of 60 days corresponds to a typical near-final value for case counts as +reported in the context of insurance. To avoid this filtering, either set +to \code{NULL} or 0.} \item{within_latest}{double between 0 and 1. 
Determines the threshold -used for the \code{time_to}} +used for the \code{lag_to}} \item{quick_revision}{difftime or integer (integer is treated as days), for the printed summary, the amount of time between the final revision and the @@ -52,20 +56,25 @@ days} \item{few_revisions}{integer, for the printed summary, the upper bound on the number of revisions to consider "few". Default is 3.} -\item{abs_spread_threshold}{numeric, for the printed summary, the maximum -spread used to characterize revisions which don't actually change very -much. Default is 5\% of the maximum value in the dataset, but this is the -most unit dependent of values, and likely needs to be chosen appropriate -for the scale of the dataset.} +\item{abs_spread_threshold}{length-1 numeric, for the printed summary, the +maximum spread used to characterize revisions which don't actually change +very much. Default is 5\% of the maximum value in the dataset, but this is +the most unit dependent of values, and likely needs to be chosen +appropriate for the scale of the dataset.} -\item{rel_spread_threshold}{float between 0 and 1, for the printed summary, -the relative spread fraction used to characterize revisions which don't -actually change very much. Default is .1, or 10\% of the final value} +\item{rel_spread_threshold}{length-1 double between 0 and 1, for the printed +summary, the relative spread fraction used to characterize revisions which +don't actually change very much. Default is .1, or 10\% of the final value} -\item{compactify_tol}{float, used if \code{drop_nas=TRUE}, it determines the -threshold for when two floats are considered identical.} +\item{compactify}{bool. If \code{TRUE}, we will compactify after the signal +requested in \code{...} has been selected on its own and the \code{drop_nas} step. +This helps, for example, to give similar results when called on +\link[=epix_merge]{merged} and single-signal archives, since merged archives +record an update when any of the other signals change, not just the +requested signal. The default is \code{TRUE}.} -\item{should_compactify}{bool. Compactify if \code{TRUE}.} +\item{compactify_abs_tol}{length-1 double, used if \code{compactify} is \code{TRUE}, it +determines the threshold for when two doubles are considered identical.} } \description{ \code{revision_summary} removes all missing values (if requested), and then @@ -87,13 +96,22 @@ always excludes \code{NA} values) \item \code{rel_spread}: \code{spread} divided by the largest value (so it will always be less than 1). Note that this need not be the final value. It will be \code{NA} whenever \code{spread} is 0. -\item \code{time_near_latest}: the time taken for the revisions to settle to within +\item \code{lag_near_latest}: the time taken for the revisions to settle to within \code{within_latest} (default 20\%) of the final value and stay there. For example, consider the series (0, 20, 99, 150, 102, 100); then -\code{time_near_latest} is 5, since even though 99 is within 20\%, it is outside +\code{lag_near_latest} is 5, since even though 99 is within 20\%, it is outside the window afterwards at 150. } } +\details{ +Applies to \code{epi_archive}s with \code{time_type}s of \code{"day"}, \code{"week"}, +and \code{"yearmonth"}. It can also work with a \code{time_type} of \code{"integer"} if +the possible \code{time_values} are all consecutive integers; you will need to +manually specify the \code{min_waiting_period} and \code{quick_revision}, though. 
+Using a \code{time_type} of \code{"integer"} with week numbers like 202501 will +produce incorrect results for some calculations, since week numbering +contains jumps at year boundaries. +} \examples{ revision_example <- revision_summary(archive_cases_dv_subset, percent_cli) revision_example \%>\% arrange(desc(spread)) diff --git a/man/time_delta_standardize.Rd b/man/time_delta_standardize.Rd new file mode 100644 index 000000000..4ce0bd53d --- /dev/null +++ b/man/time_delta_standardize.Rd @@ -0,0 +1,12 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/time-utils.R +\name{time_delta_standardize} +\alias{time_delta_standardize} +\title{Standardize time_deltas to a multiple of \code{\link[=unit_time_delta]{unit_time_delta()}}} +\usage{ +time_delta_standardize(time_delta, time_type, format = c("friendly", "fast")) +} +\description{ +Standardize time_deltas to a multiple of \code{\link[=unit_time_delta]{unit_time_delta()}} +} +\keyword{internal} diff --git a/man/time_delta_summary.Rd b/man/time_delta_summary.Rd new file mode 100644 index 000000000..c6b57c9bb --- /dev/null +++ b/man/time_delta_summary.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/revision_analysis.R +\name{time_delta_summary} +\alias{time_delta_summary} +\title{Like \code{summary} but working across all "time deltas", including difftimes} +\usage{ +time_delta_summary(time_delta, time_type) +} +\description{ +Also standardizes units of difftimes to the natural unit for the given +\code{time_type} (via conversion to and from a corresponding number of time +steps). +} +\keyword{internal} diff --git a/man/time_delta_to_approx_difftime.Rd b/man/time_delta_to_approx_difftime.Rd new file mode 100644 index 000000000..a726fd62b --- /dev/null +++ b/man/time_delta_to_approx_difftime.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/time-utils.R +\name{time_delta_to_approx_difftime} +\alias{time_delta_to_approx_difftime} +\title{Convert \code{time_delta} to an approximate difftime} +\usage{ +time_delta_to_approx_difftime(time_delta, time_type) +} +\description{ +\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} +} +\details{ +This is to assist in comparing \code{time_delta}s to default \code{difftime} thresholds +when we want to reduce friction. + +It may be better to try to do something like make \code{time_delta} validation +more accommodating (e.g., of difftimes with units of "days" when working on a +weekly scale), and remain rigid on yearmonths. Applying deltas and comparing +time_values might also be an approach but seems more fraught as the least +common denominator would be start/mid/end datetimes of time intervals, but +those are also ambiguous (the starting/representative weekdays of weeks are unknown, +and the time zones of dates are unknown). + +Another alternative approach, below, converts difftimes to time_deltas +instead. It requires knowledge of which way to round in order to get +time_deltas representing an integer number of time steps, but avoids some +potential inconsistencies of the time-delta-to-difftime approach when we +think about applying it to, e.g., months / spans of months with varying +numbers of days, and also makes it easier to avoid "magical defaults".
+} +\keyword{internal} diff --git a/man/time_delta_to_n_steps.Rd b/man/time_delta_to_n_steps.Rd index 937159195..d7858064b 100644 --- a/man/time_delta_to_n_steps.Rd +++ b/man/time_delta_to_n_steps.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils.R +% Please edit documentation in R/time-utils.R \name{time_delta_to_n_steps} \alias{time_delta_to_n_steps} \title{Convert a time delta to a integerish number of "unit" steps between time values} diff --git a/man/time_minus_time_in_n_steps.Rd b/man/time_minus_time_in_n_steps.Rd new file mode 100644 index 000000000..aab030dea --- /dev/null +++ b/man/time_minus_time_in_n_steps.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/time-utils.R +\name{time_minus_time_in_n_steps} +\alias{time_minus_time_in_n_steps} +\title{Difference between two time value vectors in terms of number of time "steps"} +\usage{ +time_minus_time_in_n_steps(x, y, time_type) +} +\arguments{ +\item{x}{a time_value (vector) of time type \code{time_type}} + +\item{y}{a time_value (vector) of time type \code{time_type}} + +\item{time_type}{as in \code{\link[=validate_slide_window_arg]{validate_slide_window_arg()}}} +} +\value{ +integerish vector such that \code{x + n_steps_to_time_delta_fast(result)} +should equal \code{y}. +} +\description{ +Difference between two time value vectors in terms of number of time "steps" +} +\keyword{internal} diff --git a/man/time_plus_n_steps.Rd b/man/time_plus_n_steps.Rd new file mode 100644 index 000000000..f7071c132 --- /dev/null +++ b/man/time_plus_n_steps.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/time-utils.R +\name{time_plus_n_steps} +\alias{time_plus_n_steps} +\alias{time_minus_n_steps} +\title{Advance/retreat time_values by specified number of time "steps"} +\usage{ +time_plus_n_steps(x, y, time_type) + +time_minus_n_steps(x, y, time_type) +} +\arguments{ +\item{x}{a time_value (vector) of time type \code{time_type}} + +\item{y}{integerish (vector)} + +\item{time_type}{as in \code{\link[=validate_slide_window_arg]{validate_slide_window_arg()}}} +} +\value{ +a time_value (vector) of time type \code{time_type} +} +\description{ +Here, a "step" is based on the \code{time_type}, not just the class of \code{x}. +} +\keyword{internal} diff --git a/man/time_type_unit_abbr.Rd b/man/time_type_unit_abbr.Rd new file mode 100644 index 000000000..4e2971500 --- /dev/null +++ b/man/time_type_unit_abbr.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/time-utils.R +\name{time_type_unit_abbr} +\alias{time_type_unit_abbr} +\title{Get an abbreviation for the "units" of \code{unit_time_delta(time_type)}} +\usage{ +time_type_unit_abbr(time_type) +} +\arguments{ +\item{time_type}{str} +} +\value{ +str +} +\description{ +For use in formatting or automatically naming things based on +\code{time_delta_to_n_steps(time_delta)} for a \code{time_delta} between times of time +type \code{time_type}. 
+} +\keyword{internal} diff --git a/man/time_type_unit_abbrs.Rd b/man/time_type_unit_abbrs.Rd new file mode 100644 index 000000000..d6b93ec1b --- /dev/null +++ b/man/time_type_unit_abbrs.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/time-utils.R +\docType{data} +\name{time_type_unit_abbrs} +\alias{time_type_unit_abbrs} +\title{Helper data for \code{\link{time_type_unit_abbr}}} +\format{ +An object of class \code{character} of length 3. +} +\usage{ +time_type_unit_abbrs +} +\description{ +Helper data for \code{\link{time_type_unit_abbr}} +} +\keyword{internal} diff --git a/man/time_type_unit_pluralizer.Rd b/man/time_type_unit_pluralizer.Rd new file mode 100644 index 000000000..a815e483f --- /dev/null +++ b/man/time_type_unit_pluralizer.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/time-utils.R +\docType{data} +\name{time_type_unit_pluralizer} +\alias{time_type_unit_pluralizer} +\title{Helper data for \code{\link{format_time_delta}}} +\format{ +An object of class \code{character} of length 4. +} +\usage{ +time_type_unit_pluralizer +} +\description{ +Should not be altered on the basis of untrusted user input, as it is used as +a cli format string and may run code. +} +\keyword{internal} diff --git a/man/time_within_x_latest.Rd b/man/time_within_x_latest.Rd deleted file mode 100644 index 1dd7e8010..000000000 --- a/man/time_within_x_latest.Rd +++ /dev/null @@ -1,15 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/revision_analysis.R -\name{time_within_x_latest} -\alias{time_within_x_latest} -\title{pull the value from lags when values starts indefinitely being within prop of it's last value.} -\usage{ -time_within_x_latest(lags, values, prop = 0.2) -} -\arguments{ -\item{values}{this should be a 1 column tibble. errors may occur otherwise} -} -\description{ -pull the value from lags when values starts indefinitely being within prop of it's last value. -} -\keyword{internal} diff --git a/man/unit_time_delta.Rd b/man/unit_time_delta.Rd index 46b3c48d6..ec63d2558 100644 --- a/man/unit_time_delta.Rd +++ b/man/unit_time_delta.Rd @@ -1,20 +1,29 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils.R +% Please edit documentation in R/time-utils.R \name{unit_time_delta} \alias{unit_time_delta} \title{Object that, added to time_values of time_type, advances by one time step/interval} \usage{ -unit_time_delta(time_type) +unit_time_delta(time_type, format = c("friendly", "fast")) } \arguments{ \item{time_type}{string; \code{epi_df}'s or \code{epi_archive}'s \code{time_type}} + +\item{format}{"friendly" or "fast"; for some time_types, there are multiple +ways to represent time_deltas. "friendly" tries to output a format that +will be more informative when printed, and produce errors in more cases +when used in unexpected ways. "fast" tries to output a time_delta that will +be faster in downstream operations.} } \value{ an object \code{u} such that \code{time_values + u} represents advancing by one time step / moving to the subsequent time interval for any \code{time_values} object of time type \code{time_type}, and such that \code{time_values + k * u} for integerish vector \code{k} advances by \code{k} steps (with vectorization, -recycling). +recycling). 
At time of writing, these objects also all support +multiplication by nonintegerish numeric vectors, \code{mean}, and \code{median}, +which are useful for summarizing vector time_deltas, but these fractional +time_deltas are not allowed in time_delta-specific operations. } \description{ Object that, added to time_values of time_type, advances by one time step/interval diff --git a/man/update_is_locf.Rd b/man/update_is_locf.Rd new file mode 100644 index 000000000..722f3d5c4 --- /dev/null +++ b/man/update_is_locf.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/archive.R +\name{update_is_locf} +\alias{update_is_locf} +\title{Internal helper; lgl; which updates are LOCF} +\usage{ +update_is_locf(arranged_updates_df, ukey_names, abs_tol) +} +\arguments{ +\item{arranged_updates_df}{an arranged update data frame like an \code{epi_archive} \code{DT}} + +\item{ukey_names}{(not validated:) chr; the archive/equivalent +\code{\link{key_colnames}}; must include \code{"version"}.} + +\item{abs_tol}{(not validated:) as in \code{\link{apply_compactify}}} +} +\value{ +lgl +} +\description{ +(Not validated:) Must be called inside certain dplyr data masking verbs (e.g., +\code{filter} or \code{mutate}) being run on an \code{epi_archive}'s \code{DT} or a data frame +formatted like one. +} +\keyword{internal} diff --git a/man/validate_slide_window_arg.Rd b/man/validate_slide_window_arg.Rd new file mode 100644 index 000000000..13e79fb67 --- /dev/null +++ b/man/validate_slide_window_arg.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/time-utils.R +\name{validate_slide_window_arg} +\alias{validate_slide_window_arg} +\title{Validate \code{.before} or \code{.window_size} argument} +\usage{ +validate_slide_window_arg( + arg, + time_type, + lower = 1, + allow_inf = TRUE, + arg_name = rlang::caller_arg(arg) +) +} +\description{ +Validate \code{.before} or \code{.window_size} argument +} +\keyword{internal} diff --git a/tests/testthat/_snaps/archive.md b/tests/testthat/_snaps/archive.md index 6e010da06..61cabbd3d 100644 --- a/tests/testthat/_snaps/archive.md +++ b/tests/testthat/_snaps/archive.md @@ -1,13 +1,5 @@ -# new_epi_archive correctly detects and warns about compactification +# as_epi_archive default compactification (no longer messages/warns) Code res <- dumb_ex %>% as_epi_archive() - Condition - Warning: - Found rows that appear redundant based on last (version of each) observation carried forward; these rows have been removed to 'compactify' and save space: - Key: - geo_value time_value value version - - 1: ca 2020-01-01 1 2020-01-02 - Built-in `epi_archive` functionality should be unaffected, but results may change if you work directly with its fields (such as `DT`). See `?as_epi_archive` for details. To silence this warning but keep compactification, you can pass `compactify=TRUE` when constructing the archive. diff --git a/tests/testthat/_snaps/growth_rate.md b/tests/testthat/_snaps/growth_rate.md new file mode 100644 index 000000000..213c27569 --- /dev/null +++ b/tests/testthat/_snaps/growth_rate.md @@ -0,0 +1,109 @@ +# global param constructor errors when required + + Code + growth_rate_params(df = -5) + Condition + Error in `growth_rate_params()`: + ! Assertion on 'df' failed: Element 1 is not >= 0. + +--- + + Code + growth_rate_params(nlambda = 5:8) + Condition + Error in `growth_rate_params()`: + ! Assertion on 'nlambda' failed: Must have length 1. 
+ +# new setup args and warnings are as expected + + Code + growth_rate(y = -10:10, log_scale = TRUE) + Condition + Warning: + `y` contains 0 or negative values. Taking logs may produce strange results. + Error in `growth_rate()`: + ! Either the first or last `y` values are not finite. This may be due to `log_scale = TRUE`. + +--- + + Code + growth_rate(y = -10:10, log_scale = TRUE, method = "smooth_spline") + Condition + Warning: + `y` contains 0 or negative values. Taking logs may produce strange results. + Error in `growth_rate()`: + ! Either the first or last `y` values are not finite. This may be due to `log_scale = TRUE`. + +--- + + Code + growth_rate(y = 1:30, x = c(1:20, NA, 22:30), na_rm = TRUE) + Condition + Error in `growth_rate()`: + ! Neither `x` nor `x0` may contain `NA`s. + +--- + + Code + growth_rate(y = 1:20, method = "smooth_spline", params = growth_rate_params( + lambda = 1:20)) + Condition + Error in `growth_rate()`: + ! "smooth_spline" requires 1 `lambda` but more were used. + +# parser sees all cases + + Code + parse_trendfilter_params(l) + Condition + Error in `parse_trendfilter_params()`: + ! When `cv = TRUE`, `df` must be `NULL` or character and `lambda` must be `NULL` or a vector. + +--- + + Code + parse_trendfilter_params(l) + Condition + Error in `parse_trendfilter_params()`: + ! When `cv = TRUE`, `df` must be `NULL` or character and `lambda` must be `NULL` or a vector. + +--- + + Code + parse_trendfilter_params(l) + Condition + Error in `parse_trendfilter_params()`: + ! When `cv = TRUE`, `df` must be `NULL` or character and `lambda` must be `NULL` or a vector. + +--- + + Code + parse_trendfilter_params(l) + Condition + Error in `parse_trendfilter_params()`: + ! When `cv = TRUE`, `df` must be `NULL` or character and `lambda` must be `NULL` or a vector. + +--- + + Code + parse_trendfilter_params(l) + Condition + Error in `parse_trendfilter_params()`: + ! When `cv = TRUE`, `df` must be `NULL` or character and `lambda` must be `NULL` or a vector. + +--- + + Code + parse_trendfilter_params(l) + Condition + Error in `parse_trendfilter_params()`: + ! `df` a character implies using CV, but also setting `lambda` to a single value implies no CV. + +--- + + Code + parse_trendfilter_params(l) + Condition + Error in `parse_trendfilter_params()`: + ! `df` and `lambda` cannot both be scalars. 
+ diff --git a/tests/testthat/_snaps/revision-latency-functions.md b/tests/testthat/_snaps/revision-latency-functions.md index 1ac214691..4f2bfe269 100644 --- a/tests/testthat/_snaps/revision-latency-functions.md +++ b/tests/testthat/_snaps/revision-latency-functions.md @@ -1,4 +1,4 @@ -# revision_summary works for a dummy dataset +# revision_summary works for dummy datasets Code dummy_ex %>% revision_summary() %>% print(n = 10, width = 300) @@ -20,20 +20,20 @@ * 1 out of 4 (25%) Spread of more than 5.1 in actual value (when revised): * 3 out of 4 (75%) - days until within 20% of the latest value: + Days until within 20% of the latest value: Output min median mean max 0 days 3 days 6.9 days 19 days # A tibble: 7 x 11 - time_value geo_value n_revisions min_lag max_lag time_near_latest spread - - 1 2020-01-01 ak 4 2 days 19 days 19 days 101 - 2 2020-01-02 ak 1 4 days 5 days 4 days 9 - 3 2020-01-03 ak 0 3 days 3 days 3 days 0 - 4 2020-01-01 al 1 0 days 19 days 19 days 99 - 5 2020-01-02 al 0 0 days 0 days 0 days 0 - 6 2020-01-03 al 1 1 days 2 days 2 days 3 - 7 2020-01-04 al 0 1 days 1 days 1 days 0 + time_value geo_value n_revisions min_lag max_lag lag_near_latest spread + + 1 2020-01-01 ak 4 2 days 19 days 19 days 101 + 2 2020-01-02 ak 1 4 days 5 days 4 days 9 + 3 2020-01-03 ak 0 3 days 3 days 3 days 0 + 4 2020-01-01 al 1 0 days 19 days 19 days 99 + 5 2020-01-02 al 0 0 days 0 days 0 days 0 + 6 2020-01-03 al 1 1 days 2 days 2 days 3 + 7 2020-01-04 al 0 1 days 1 days 1 days 0 rel_spread min_value max_value median_value 1 0.990 1 102 6 @@ -68,20 +68,166 @@ * 2 out of 5 (40%) Spread of more than 5.1 in actual value (when revised): * 3 out of 5 (60%) - days until within 20% of the latest value: + Days until within 20% of the latest value: Output min median mean max 0 days 3 days 6.9 days 19 days # A tibble: 7 x 11 - time_value geo_value n_revisions min_lag max_lag time_near_latest spread - - 1 2020-01-01 ak 6 2 days 19 days 19 days 101 - 2 2020-01-02 ak 1 4 days 5 days 4 days 9 - 3 2020-01-03 ak 0 3 days 3 days 3 days 0 - 4 2020-01-01 al 1 0 days 19 days 19 days 99 - 5 2020-01-02 al 0 0 days 0 days 0 days 0 - 6 2020-01-03 al 1 1 days 2 days 2 days 3 - 7 2020-01-04 al 1 0 days 1 days 1 days 0 + time_value geo_value n_revisions min_lag max_lag lag_near_latest spread + + 1 2020-01-01 ak 6 2 days 19 days 19 days 101 + 2 2020-01-02 ak 1 4 days 5 days 4 days 9 + 3 2020-01-03 ak 0 3 days 3 days 3 days 0 + 4 2020-01-01 al 1 0 days 19 days 19 days 99 + 5 2020-01-02 al 0 0 days 0 days 0 days 0 + 6 2020-01-03 al 1 1 days 2 days 2 days 3 + 7 2020-01-04 al 1 0 days 1 days 1 days 0 + rel_spread min_value max_value median_value + + 1 0.990 1 102 5.5 + 2 0.09 91 100 95.5 + 3 NaN 0 0 0 + 4 0.99 1 100 50.5 + 5 0 1 1 1 + 6 0.75 1 4 2.5 + 7 0 9 9 9 + +--- + + Code + dummy_ex_weekly %>% revision_summary(drop_nas = FALSE) %>% print(n = 10, width = 300) + Message + Min lag (time to first version): + Output + min median mean max + 0 weeks 1 weeks 1.4 weeks 4 weeks + Message + Fraction of all versions that are `NA`: + * 2 out of 19 (10.53%) + Fraction of epi_key+time_values with + No revisions: + * 2 out of 7 (28.57%) + Quick revisions (last revision within 1 week of the `time_value`): + * 2 out of 7 (28.57%) + Few revisions (At most 3 revisions for that `time_value`): + * 6 out of 7 (85.71%) + Fraction of revised epi_key+time_values which have: + Less than 0.1 spread in relative value: + * 2 out of 5 (40%) + Spread of more than 5.1 in actual value (when revised): + * 3 out of 5 (60%) + Weeks until within 20% of the 
latest value: + Output + min median mean max + 0 weeks 3 weeks 6.9 weeks 19 weeks + # A tibble: 7 x 11 + time_value geo_value n_revisions min_lag max_lag lag_near_latest spread + + 1 2020-01-01 ak 6 2 weeks 19 weeks 19 weeks 101 + 2 2020-01-08 ak 1 4 weeks 5 weeks 4 weeks 9 + 3 2020-01-15 ak 0 3 weeks 3 weeks 3 weeks 0 + 4 2020-01-01 al 1 0 weeks 19 weeks 19 weeks 99 + 5 2020-01-08 al 0 0 weeks 0 weeks 0 weeks 0 + 6 2020-01-15 al 1 1 weeks 2 weeks 2 weeks 3 + 7 2020-01-22 al 1 0 weeks 1 weeks 1 weeks 0 + rel_spread min_value max_value median_value + + 1 0.990 1 102 5.5 + 2 0.09 91 100 95.5 + 3 NaN 0 0 0 + 4 0.99 1 100 50.5 + 5 0 1 1 1 + 6 0.75 1 4 2.5 + 7 0 9 9 9 + +--- + + Code + dummy_ex_yearmonthly %>% revision_summary(drop_nas = FALSE) %>% print(n = 10, + width = 300) + Message + Min lag (time to first version): + Output + min median mean max + 0 1 1.4 4 + Message + Fraction of all versions that are `NA`: + * 2 out of 19 (10.53%) + Fraction of epi_key+time_values with + No revisions: + * 2 out of 7 (28.57%) + Quick revisions (last revision within 1 month of the `time_value`): + * 2 out of 7 (28.57%) + Few revisions (At most 3 revisions for that `time_value`): + * 6 out of 7 (85.71%) + Fraction of revised epi_key+time_values which have: + Less than 0.1 spread in relative value: + * 2 out of 5 (40%) + Spread of more than 5.1 in actual value (when revised): + * 3 out of 5 (60%) + Months until within 20% of the latest value: + Output + min median mean max + 0 3 6.9 19 + # A tibble: 7 x 11 + time_value geo_value n_revisions min_lag max_lag lag_near_latest spread + + 1 2020 Jan ak 6 2 19 19 101 + 2 2020 Feb ak 1 4 5 4 9 + 3 2020 Mar ak 0 3 3 3 0 + 4 2020 Jan al 1 0 19 19 99 + 5 2020 Feb al 0 0 0 0 0 + 6 2020 Mar al 1 1 2 2 3 + 7 2020 Apr al 1 0 1 1 0 + rel_spread min_value max_value median_value + + 1 0.990 1 102 5.5 + 2 0.09 91 100 95.5 + 3 NaN 0 0 0 + 4 0.99 1 100 50.5 + 5 0 1 1 1 + 6 0.75 1 4 2.5 + 7 0 9 9 9 + +--- + + Code + dummy_ex_integerly %>% revision_summary(min_waiting_period = 60, + quick_revision = 3, drop_nas = FALSE) %>% print(n = 10, width = 300) + Message + Min lag (time to first version): + Output + min median mean max + 0 1 1.4 4 + Message + Fraction of all versions that are `NA`: + * 2 out of 19 (10.53%) + Fraction of epi_key+time_values with + No revisions: + * 2 out of 7 (28.57%) + Quick revisions (last revision within 3 time steps of the `time_value`): + * 4 out of 7 (57.14%) + Few revisions (At most 3 revisions for that `time_value`): + * 6 out of 7 (85.71%) + Fraction of revised epi_key+time_values which have: + Less than 0.1 spread in relative value: + * 2 out of 5 (40%) + Spread of more than 5.1 in actual value (when revised): + * 3 out of 5 (60%) + Time Steps until within 20% of the latest value: + Output + min median mean max + 0 3 6.9 19 + # A tibble: 7 x 11 + time_value geo_value n_revisions min_lag max_lag lag_near_latest spread + + 1 1 ak 6 2 19 19 101 + 2 2 ak 1 4 5 4 9 + 3 3 ak 0 3 3 3 0 + 4 1 al 1 0 19 19 99 + 5 2 al 0 0 0 0 0 + 6 3 al 1 1 2 2 3 + 7 4 al 1 0 1 1 0 rel_spread min_value max_value median_value 1 0.990 1 102 5.5 diff --git a/tests/testthat/test-archive-version-bounds.R b/tests/testthat/test-archive-version-bounds.R index d12c40604..ee500d308 100644 --- a/tests/testthat/test-archive-version-bounds.R +++ b/tests/testthat/test-archive-version-bounds.R @@ -110,14 +110,14 @@ test_that("archive version bounds args work as intended", { clobberable_versions_start = NA, versions_end = measurement_date ), - regexp = "`x` contained updates for a later 
version" + regexp = "`x\\$DT` contained updates for a later version" ) expect_error( as_epi_archive(update_tbl, clobberable_versions_start = measurement_date + 6L, versions_end = measurement_date + 5L ), - regexp = "`clobberable_versions_start`.*indicated that there were later observed versions" + regexp = "`x\\$clobberable_versions_start`.*indicating that there were later observed versions" ) expect_error(as_epi_archive(update_tbl, versions_end = NA), class = "epiprocess__version_bound_na_with_na_not_okay" diff --git a/tests/testthat/test-archive.R b/tests/testthat/test-archive.R index 0a06cf794..0e84b03ba 100644 --- a/tests/testthat/test-archive.R +++ b/tests/testthat/test-archive.R @@ -54,13 +54,13 @@ dumb_ex <- data.frame( value = c(1, 1), version = as.Date(c("2020-01-01", "2020-01-02")) ) -test_that("new_epi_archive correctly detects and warns about compactification", { - expect_snapshot(res <- dumb_ex %>% as_epi_archive(), cnd_class = TRUE) +test_that("as_epi_archive default compactification (no longer messages/warns)", { + expect_snapshot(res <- dumb_ex %>% as_epi_archive()) }) test_that("other_keys can only contain names of the data.frame columns", { expect_error(as_epi_archive(archive_data, other_keys = "xyz", compactify = FALSE), - regexp = "`other_keys` must be contained in the column names of `x`." + regexp = "missing the following expected columns: xyz" ) expect_error(as_epi_archive(archive_data, other_keys = "percent_cli", compactify = FALSE), NA) }) @@ -220,5 +220,5 @@ test_that("`epi_archive` rejects dataframes where time_value and version columns test_that("is_locf works as expected", { vec <- c(1, 1, 1e-10, 1.1e-10, NA, NA, NaN, NaN) is_repeated <- c(0, 1, 0, 1, 0, 1, 1, 1) - expect_equal(is_locf(vec, .Machine$double.eps^0.5), as.logical(is_repeated)) + expect_equal(is_locf(vec, .Machine$double.eps^0.5, FALSE), as.logical(is_repeated)) }) diff --git a/tests/testthat/test-compactify.R b/tests/testthat/test-compactify.R index d05fe0b33..2eed5025f 100644 --- a/tests/testthat/test-compactify.R +++ b/tests/testthat/test-compactify.R @@ -52,14 +52,15 @@ dt <- row_replace(dt, 74, 73, 74) # Not LOCF dt_true <- as_tibble(as_epi_archive(dt, compactify = TRUE)$DT) dt_false <- as_tibble(as_epi_archive(dt, compactify = FALSE)$DT) -dt_null <- suppressWarnings(as_tibble(as_epi_archive(dt, compactify = NULL)$DT)) +dt_message <- suppressMessages(as_tibble(as_epi_archive(dt, compactify = "message")$DT)) +dt_0 <- as_tibble(as_epi_archive(dt, compactify = TRUE, compactify_abs_tol = 0)$DT) -test_that("Warning for LOCF with compactify as NULL", { - expect_warning(as_epi_archive(dt, compactify = NULL)) +test_that('Warning for LOCF with compactify as "message"', { + expect_message(as_epi_archive(dt, compactify = "message")) }) test_that("No warning when there is no LOCF", { - expect_warning(as_epi_archive(dt[1:5], compactify = NULL), NA) + expect_no_message(as_epi_archive(dt[1:5], compactify = "message")) }) test_that("LOCF values are ignored with compactify=FALSE", { @@ -69,8 +70,21 @@ test_that("LOCF values are ignored with compactify=FALSE", { test_that("LOCF values are taken out with compactify=TRUE", { dt_test <- as_tibble(as_epi_archive(dt[-c(21, 22, 40), ], compactify = FALSE)$DT) - expect_identical(dt_true, dt_null) - expect_identical(dt_null, dt_test) + expect_identical(dt_true, dt_message) + expect_identical(dt_message, dt_test) + + # Tolerance is nonstrict and tolerance 0 still compactifies: + expect_identical(dt_0, dt_test) +}) + +test_that("apply_compactify yields compatible results 
with tibbles and archive DTs", { + via_ea_compactified_tbl <- as_tibble(as.data.frame(as_epi_archive(dt)$DT)) + + tbl <- as_tibble(as.data.frame(dt)) + ea_key_names <- key(dt) + + expect_equal(apply_compactify(tbl, ea_key_names), via_ea_compactified_tbl) + expect_equal(apply_compactify(arrange(tbl, version), ea_key_names), via_ea_compactified_tbl) }) test_that("as_of produces the same results with compactify=TRUE as with compactify=FALSE", { @@ -105,3 +119,80 @@ test_that("compactify does not alter the default clobberable and observed versio expect_identical(ea_true$clobberable_versions_start, ea_false$clobberable_versions_start) expect_identical(ea_true$versions_end, ea_false$versions_end) }) + +test_that("compactify works on distributions", { + forecasts <- tibble( + ahead = 2L, + geo_value = "ak", + target_end_date = as.Date("2020-01-19"), + forecast_date = as.Date("2020-01-17") + 1:8, + actual = 25, + .pred_distn = c( + epipredict::dist_quantiles(c(1, 5, 9), c(0.1, 0.5, 0.9)), + epipredict::dist_quantiles(c(1, NA, 9), c(0.1, 0.5, 0.9)), # single NA in quantiles + epipredict::dist_quantiles(c(NA, NA, NA), c(0.1, 0.5, 0.9)), # all NAs in quantiles + distributional::dist_missing(1), # the actual `NA` for distributions + epipredict::dist_quantiles(c(1, 5, 9), c(0.1, 0.5, 0.9)), # and back + epipredict::dist_quantiles(c(3, 5, 9), c(0.1, 0.5, 0.9)), # change quantile + epipredict::dist_quantiles(c(3, 5, 9), c(0.2, 0.5, 0.8)), # change level + epipredict::dist_quantiles(c(3, 5, 9), c(0.2, 0.5, 0.8)) # LOCF + ) + ) + expect_equal( + forecasts %>% + as_epi_archive( + other_keys = "ahead", time_value = target_end_date, version = forecast_date, + compactify = TRUE + ) %>% + .$DT %>% + as.data.frame() %>% + as_tibble(), + forecasts[-8, ] %>% + rename(time_value = target_end_date, version = forecast_date) + ) +}) + +test_that("Large compactify_abs_tol does not drop edf keys", { + # several epikeytimes, each with a single version + x <- tibble( + geo_value = 1, + time_value = 1:5, + version = 11:15, + value = 1001:1005 + ) + # We shouldn't drop epikeytimes: + expect_equal(as_tibble(as.data.frame(as_epi_archive(x, compactify_abs_tol = 3)$DT)), x) +}) + +test_that("Large compactify_abs_tol does not apply to non-is.numeric columns", { + # one epikeytime with many versions: + d <- as.Date("2000-01-01") + x <- tibble( + geo_value = 1, + time_value = d + 1, + version = d + 11:15, + lag = version - time_value, # non-is.numeric + value = 1001:1005 + ) + expect_equal(as_tibble(as.data.frame(as_epi_archive(x, compactify_abs_tol = 3)$DT)), x) +}) + +test_that("Large compactify_abs_tol works on value columns", { + # one epikeytime with many versions: + d <- as.Date("2000-01-01") + x <- tibble( + geo_value = 1, + time_value = d + 1, + version = d + 11:15, + value = 1001:1005 + ) + expect_equal( + as_tibble(as.data.frame(as_epi_archive(x, compactify_abs_tol = 3)$DT)), + tibble( + geo_value = 1, + time_value = d + 1, + version = d + 11, # XXX do we want d + c(11,14) instead? + value = 1001 # XXX do we want c(1001, 1004) instead? 
+ ) + ) +}) diff --git a/tests/testthat/test-epix_slide.R b/tests/testthat/test-epix_slide.R index c0d752dc0..d3b8c3288 100644 --- a/tests/testthat/test-epix_slide.R +++ b/tests/testthat/test-epix_slide.R @@ -329,12 +329,12 @@ ea <- tibble::tribble( test_that("epix_slide with .all_versions option has access to all older versions", { slide_fn <- function(x, gk, rtv) { - return(tibble( + tibble( n_versions = length(unique(x$DT$version)), n_row = nrow(x$DT), dt_class1 = class(x$DT)[[1L]], dt_key = list(key(x$DT)) - )) + ) } ea_orig_mirror <- ea %>% clone() @@ -481,7 +481,7 @@ test_that("epix_as_of and epix_slide with long enough window are compatible", { test_that("epix_slide `f` is passed an ungrouped `epi_archive` when `.all_versions=TRUE`", { slide_fn <- function(x, gk, rtv) { expect_class(x, "epi_archive") - return(NA) + NA } ea %>% diff --git a/tests/testthat/test-grouped_epi_archive.R b/tests/testthat/test-grouped_epi_archive.R index 8ed5ea02e..663fe4e9c 100644 --- a/tests/testthat/test-grouped_epi_archive.R +++ b/tests/testthat/test-grouped_epi_archive.R @@ -77,10 +77,6 @@ test_that("Grouping, regrouping, and ungrouping archives works as intended", { age_group = ordered(age_group, c("pediatric", "adult")), version = as.Date(version) ) %>% - # as_epi_df(as_of = as.Date("2000-01-03"), - # other_keys = "age_group") %>% - # # put back in expected order; see issue #166: - # select(geo_value, age_group, time_value, s) %>% group_by(geo_value, age_group, .drop = FALSE) ) }) diff --git a/tests/testthat/test-growth_rate.R b/tests/testthat/test-growth_rate.R new file mode 100644 index 000000000..9aa9936d6 --- /dev/null +++ b/tests/testthat/test-growth_rate.R @@ -0,0 +1,191 @@ +test_that("global param constructor errors when required", { + # Check the tree when there is parameter dependency + expect_identical(growth_rate_params(df = "1se")$df, "1se") + expect_false(growth_rate_params(df = 10, cv = FALSE)$cv) + expect_identical(growth_rate_params(df = 10L)$df, 10L) + expect_snapshot(error = TRUE, growth_rate_params(df = -5)) + + # Make sure that assert_number is len 1 + expect_identical(growth_rate_params(nlambda = 5L)$nlambda, 5L) + expect_snapshot(error = TRUE, growth_rate_params(nlambda = 5:8)) +}) + +test_that("new setup args and warnings are as expected", { + # NaN in log calculation + expect_snapshot(error = TRUE, growth_rate(y = -10:10, log_scale = TRUE)) + expect_snapshot(error = TRUE, growth_rate(y = -10:10, log_scale = TRUE, method = "smooth_spline")) + + # NAs in x or y are removed + expect_length(growth_rate(y = c(1:20, NA, 22:30)), 30L) + expect_length(growth_rate(y = c(1:20, NA, 22:30), na_rm = TRUE), 30L) + expect_snapshot(error = TRUE, growth_rate(y = 1:30, x = c(1:20, NA, 22:30), na_rm = TRUE)) + + # splines and trendfilter error on NAs + expect_length(growth_rate(y = c(1:20, NA, 22:30), method = "smooth_spline"), 30L) + expect_length(growth_rate(y = c(1:20, NA, 22:30), method = "trend_filter"), 30L) + expect_warning(growth_rate(y = c(1:20, -5, 22:30), log_scale = TRUE, method = "smooth_spline")) + expect_warning(growth_rate(y = c(1:20, -5, 22:30), log_scale = TRUE, method = "trend_filter")) + + # splines with multiple lambdas + expect_snapshot( + error = TRUE, + growth_rate( + y = 1:20, method = "smooth_spline", + params = growth_rate_params(lambda = 1:20) + ) + ) + + # other spline args give output (correctness not checked) + z <- rnorm(30) + expect_length(growth_rate(y = z, method = "smooth_spline"), 30L) + expect_length(growth_rate( + y = z, + method = "smooth_spline", params = 
growth_rate_params(spar = .5) + ), 30L) + expect_length(growth_rate( + y = z, + method = "smooth_spline", params = growth_rate_params(lambda = 10) + ), 30L) + expect_length(growth_rate( + y = z, + method = "smooth_spline", params = growth_rate_params(df = 14) + ), 30L) + expect_length(growth_rate( + y = z, + method = "smooth_spline", params = growth_rate_params(cv = TRUE) + ), 30L) +}) + +test_that("parser sees all cases", { + skip_if_not_installed("trendfilter", "0.0.2") + # 18 total cases + # lambda in {NULL, scalar, vector} + # df in {NULL, character, numeric} + # cv in {T/F} + + grab_l <- function(l) list(cv = l$cv, df = l$df, lambda = l$lambda) + + # CV TRUE + l <- growth_rate_params(cv = TRUE) + expect_identical( + grab_l(parse_trendfilter_params(l)), + list(cv = TRUE, df = "min", lambda = NULL) + ) + l <- growth_rate_params(cv = TRUE, df = "1se") + expect_identical( + grab_l(parse_trendfilter_params(l)), + list(cv = TRUE, df = "1se", lambda = NULL) + ) + l <- growth_rate_params(cv = TRUE, df = "min", lambda = 1:5) + expect_identical( + grab_l(parse_trendfilter_params(l)), + list(cv = TRUE, df = "min", lambda = 1:5) + ) + l <- growth_rate_params(cv = TRUE, lambda = 1:5) + expect_identical( + grab_l(parse_trendfilter_params(l)), + list(cv = TRUE, df = "min", lambda = 1:5) + ) + l <- growth_rate_params(cv = TRUE, lambda = 1) + expect_snapshot(error = TRUE, parse_trendfilter_params(l)) + l <- growth_rate_params(cv = TRUE, df = 1) + expect_snapshot(error = TRUE, parse_trendfilter_params(l)) + l <- growth_rate_params(cv = TRUE, df = 1, lambda = 1) + expect_snapshot(error = TRUE, parse_trendfilter_params(l)) + l <- growth_rate_params(cv = TRUE, df = "min", lambda = 1) + expect_snapshot(error = TRUE, parse_trendfilter_params(l)) + l <- growth_rate_params(cv = TRUE, df = 1, lambda = 1:5) + expect_snapshot(error = TRUE, parse_trendfilter_params(l)) + + # CV = FALSE (the default) + # 5 Cases where we turn CV on + l <- growth_rate_params(df = "1se") + expect_identical( + grab_l(parse_trendfilter_params(l)), + list(cv = TRUE, df = "1se", lambda = NULL) + ) + l <- growth_rate_params(df = "1se", lambda = 1:5) + expect_identical( + grab_l(parse_trendfilter_params(l)), + list(cv = TRUE, df = "1se", lambda = 1:5) + ) + l <- growth_rate_params(lambda = 1:5) + expect_identical( + grab_l(parse_trendfilter_params(l)), + list(cv = TRUE, df = "min", lambda = 1:5) + ) + expect_identical( + grab_l(parse_trendfilter_params(growth_rate_params())), + list(cv = TRUE, df = "min", lambda = NULL) + ) + # 3 cases where CV stays False + l <- growth_rate_params(lambda = 1) + expect_identical( + grab_l(parse_trendfilter_params(l)), + list(cv = FALSE, df = NULL, lambda = 1) + ) + l <- growth_rate_params(df = 5) + expect_identical( + grab_l(parse_trendfilter_params(l)), + list(cv = FALSE, df = 5, lambda = NULL) + ) + l <- growth_rate_params(df = 5, lambda = 1:5) + expect_identical( + grab_l(parse_trendfilter_params(l)), + list(cv = FALSE, df = 5, lambda = 1:5) + ) + + # 2 error cases + l <- growth_rate_params(df = "min", lambda = 1) + expect_snapshot(error = TRUE, parse_trendfilter_params(l)) + l <- growth_rate_params(df = 1, lambda = 1) + expect_snapshot(error = TRUE, parse_trendfilter_params(l)) +}) + +test_that("trendfilter growth_rate implementation", { + skip_if_not_installed("trendfilter", "0.0.2") + + # various tf args give output (correctness not checked) + z <- rnorm(30) + expect_length(growth_rate(y = z, method = "trend_filter"), 30L) + expect_length(growth_rate( + y = z, + method = "trend_filter", params = 
growth_rate_params(lambda = 10) + ), 30L) + expect_length(growth_rate( + y = z, + method = "trend_filter", params = growth_rate_params(df = 14) + ), 30L) + expect_length(growth_rate( + y = z, + method = "trend_filter", params = growth_rate_params(cv = TRUE) + ), 30L) + expect_length(growth_rate( + y = z, + method = "trend_filter", params = growth_rate_params(k = 3) + ), 30L) + expect_length(growth_rate( + y = z, + method = "trend_filter", params = growth_rate_params(nlambda = 10) + ), 30L) + expect_length(growth_rate( + y = z, + method = "trend_filter", params = growth_rate_params(lambda_max = 10) + ), 30L) + expect_length(growth_rate( + y = z, + method = "trend_filter", params = growth_rate_params(lambda_min = 10) + ), 30L) + expect_length(growth_rate( + y = z, + method = "trend_filter", params = growth_rate_params(lambda_min_ratio = .1) + ), 30L) + expect_length(growth_rate( + y = z, + method = "trend_filter", params = growth_rate_params(error_measure = "mse") + ), 30L) + expect_length(growth_rate( + y = z, + method = "trend_filter", params = growth_rate_params(nfolds = 3) + ), 30L) +}) diff --git a/tests/testthat/test-key_colnames.R b/tests/testthat/test-key_colnames.R new file mode 100644 index 000000000..bd873eccb --- /dev/null +++ b/tests/testthat/test-key_colnames.R @@ -0,0 +1,194 @@ +test_that("`key_colnames` on non-`epi_df`-like tibbles works as expected", { + k1k2_tbl <- tibble::tibble(k1 = 1, k2 = 1) + + expect_equal( + key_colnames(k1k2_tbl, geo_keys = character(), time_keys = character(), other_keys = c("k1", "k2")), + c("k1", "k2") + ) + # `geo_keys`, `other_keys`, `time_keys` are mandatory: + expect_error(key_colnames(k1k2_tbl, other_keys = c("k1", "k2"), time_keys = character()), + regexp = '"geo_keys" is missing' + ) + expect_error(key_colnames(k1k2_tbl, geo_keys = character(), time_keys = character()), + regexp = '"other_keys" is missing' + ) + expect_error(key_colnames(k1k2_tbl, geo_keys = character(), other_keys = c("k1", "k2")), + regexp = '"time_keys" is missing' + ) + + # Manually specifying keys that aren't there is an error: + expect_error( + key_colnames(k1k2_tbl, geo_keys = "bogus", other_keys = c("k1", "k2"), time_keys = character()), + class = "epiprocess__key_colnames__keys_not_in_colnames" + ) + expect_error( + key_colnames(k1k2_tbl, other_keys = "bogus", geo_keys = character(), time_keys = character()), + class = "epiprocess__key_colnames__keys_not_in_colnames" + ) + expect_error( + key_colnames(k1k2_tbl, time_keys = "bogus", geo_keys = character(), other_keys = c("k1", "k2")), + class = "epiprocess__key_colnames__keys_not_in_colnames" + ) + + # We can specify non-`epi_df`-like geo keys: + expect_equal( + key_colnames(k1k2_tbl, geo_keys = c("k1", "k2"), other_keys = character(), time_keys = character()), + c("k1", "k2") + ) +}) + +test_that("`key_colnames` on `epi_df`s and similar tibbles works as expected", { + withr::local_options(list(lifecycle_verbosity = "warning")) # for extra_keys tests + + gat_tbl <- tibble::tibble(geo_value = 1, age_group = 1, time_value = 1) + gat_edf <- as_epi_df(gat_tbl, other_keys = "age_group", as_of = 2) + + # For tbl: we must provide all key naming arguments: + expect_equal( + key_colnames(gat_tbl, geo_keys = "geo_value", other_keys = "age_group", time_keys = "time_value"), + c("geo_value", "age_group", "time_value") + ) + # given same inputs, compatible edfs give something compatible: + expect_equal( + key_colnames(gat_edf, geo_keys = "geo_value", other_keys = "age_group", time_keys = "time_value"), + c("geo_value", 
"age_group", "time_value") + ) + # though edfs don't have to specify the key settings: + expect_equal( + key_colnames(gat_edf), + c("geo_value", "age_group", "time_value") + ) + # and they will balk if we write something intended to work for both tbls and + # edfs but mis-specify something: + expect_error( + key_colnames(gat_edf, geo_keys = character(0L)), + class = "epiprocess__key_colnames__mismatched_geo_keys" + ) + expect_error( + key_colnames(gat_edf, other_keys = character(0L)), + class = "epiprocess__key_colnames__mismatched_other_keys" + ) + expect_error( + key_colnames(gat_edf, time_keys = character(0L)), + class = "epiprocess__key_colnames__mismatched_time_keys" + ) + + # edfs also won't let us specify nonstandard geotime keys: + expect_error( + key_colnames(gat_edf, geo_keys = "time_value"), + class = "epiprocess__key_colnames__mismatched_geo_keys" + ) + expect_error( + key_colnames(gat_edf, time_keys = "geo_value"), + class = "epiprocess__key_colnames__mismatched_time_keys" + ) + + # We can exclude keys: + expect_equal( + key_colnames( + gat_tbl, + geo_keys = "geo_value", other_keys = "age_group", time_keys = "time_value", + exclude = c("time_value") + ), + c("geo_value", "age_group") + ) + expect_equal( + key_colnames( + gat_tbl, + geo_keys = "geo_value", other_keys = "age_group", time_keys = "time_value", + exclude = c("geo_value", "time_value") + ), + c("age_group") + ) + expect_equal( + key_colnames(gat_edf, exclude = c("time_value")), + c("geo_value", "age_group") + ) + expect_equal( + key_colnames(gat_edf, exclude = c("geo_value", "time_value")), + c("age_group") + ) + + # Using `extra_keys =` is soft-deprecated and routes to `other_keys =`: + expect_warning( + gat_tbl_extra_keys_res <- key_colnames( + gat_tbl, + geo_keys = "geo_value", time_keys = "time_value", + extra_keys = "age_group" + ), + class = "lifecycle_warning_deprecated" + ) + expect_equal(gat_tbl_extra_keys_res, c("geo_value", "age_group", "time_value")) + + expect_warning( + gat_edf_extra_keys_exclude_res <- + key_colnames( + gat_edf, + extra_keys = "age_group", + exclude = c("geo_value", "time_value") + ), + class = "lifecycle_warning_deprecated" + ) + expect_equal(gat_edf_extra_keys_exclude_res, c("age_group")) +}) + +test_that("`key_colnames` on tsibbles works as expected", { + k1k2i_tsbl <- tsibble::tsibble(k1 = 1, k2 = 1, i = 1, key = c(k1, k2), index = i) + + # Normal operation: + expect_equal(key_colnames(k1k2i_tsbl), c("k1", "k2", "i")) + + # Currently there is just bare-bones support for tsibbles to not output + # incompatible results based on `data.frame` inheritance: + expect_error( + key_colnames(k1k2i_tsbl, geo_keys = "k1"), + class = "rlib_error_dots_nonempty" + ) + expect_error( + key_colnames(k1k2i_tsbl, time_keys = "k1"), + class = "rlib_error_dots_nonempty" + ) + expect_error( + key_colnames(k1k2i_tsbl, other_keys = "k1"), + class = "rlib_error_dots_nonempty" + ) + + # We guard against confusing cases: + expect_error( + key_colnames(k1k2i_tsbl %>% tsibble::index_by(fake_coarser_i = i)), + class = "epiprocess__key_colnames__incomplete_reindexing_operation" + ) +}) + +test_that("`key_colnames` on `epi_archive`s works as expected", { + gatv_ea <- tibble(geo_value = 1, age_group = 1, time_value = 1, version = 2) %>% + as_epi_archive(other_keys = "age_group") + + # Basic operation: + expect_equal( + key_colnames(gatv_ea), + c("geo_value", "age_group", "time_value", "version") + ) + + # Since we shouldn't have uncertainty about whether we might have an archive + # or not, there's no need to 
provide compatibility with the key specification + # args: + expect_error( + key_colnames(gatv_ea, geo_keys = "k1"), + class = "rlib_error_dots_nonempty" + ) + expect_error( + key_colnames(gatv_ea, time_keys = "k1"), + class = "rlib_error_dots_nonempty" + ) + expect_error( + key_colnames(gatv_ea, other_keys = "k1"), + class = "rlib_error_dots_nonempty" + ) + + # Key exclusion works: + expect_equal( + key_colnames(gatv_ea, exclude = c("version", "time_value")), + c("geo_value", "age_group") + ) +}) diff --git a/tests/testthat/test-revision-latency-functions.R b/tests/testthat/test-revision-latency-functions.R index ff7220684..b636bf32e 100644 --- a/tests/testthat/test-revision-latency-functions.R +++ b/tests/testthat/test-revision-latency-functions.R @@ -1,17 +1,5 @@ dummy_ex <- tibble::tribble( ~geo_value, ~time_value, ~version, ~value, - # al 1 has 1 real revision, a lag of 0, and changes by 99 - "al", as.Date("2020-01-01"), as.Date("2020-01-01"), 1, - "al", as.Date("2020-01-01"), as.Date("2020-01-10"), 1, - "al", as.Date("2020-01-01"), as.Date("2020-01-20"), 100, - # al 2 has no revision, a min lag of 0, and a rel_spread of 0 - "al", as.Date("2020-01-02"), as.Date("2020-01-02"), 1, - # al 3 has 1 revision and a min lag of 1, and a change of 3 - "al", as.Date("2020-01-03"), as.Date("2020-01-04"), 1, - "al", as.Date("2020-01-03"), as.Date("2020-01-05"), 4, - # al 4 has 1 revision including NA's none if not, a lag of 0/1 and changes of 0 - "al", as.Date("2020-01-04"), as.Date("2020-01-04"), NA, - "al", as.Date("2020-01-04"), as.Date("2020-01-05"), 9, # ak 1 has 4 revisions w/out NAs, but 6 with NAs # a min lag of 2, and a change of 101 "ak", as.Date("2020-01-01"), as.Date("2020-01-03"), 1, @@ -27,15 +15,164 @@ dummy_ex <- tibble::tribble( # ak 3 has 0 revisions, and a value of zero, and thus a rel_spread of NaN "ak", as.Date("2020-01-03"), as.Date("2020-01-06"), 0, "ak", as.Date("2020-01-03"), as.Date("2020-01-07"), 0, + # al 1 has 1 real revision, a lag of 0, and changes by 99 + "al", as.Date("2020-01-01"), as.Date("2020-01-01"), 1, + "al", as.Date("2020-01-01"), as.Date("2020-01-10"), 1, + "al", as.Date("2020-01-01"), as.Date("2020-01-20"), 100, + # al 2 has no revision, a min lag of 0, and a rel_spread of 0 + "al", as.Date("2020-01-02"), as.Date("2020-01-02"), 1, + # al 3 has 1 revision and a min lag of 1, and a change of 3 + "al", as.Date("2020-01-03"), as.Date("2020-01-04"), 1, + "al", as.Date("2020-01-03"), as.Date("2020-01-05"), 4, + # al 4 has 1 revision including NA's none if not, a lag of 0/1 and changes of 0 + "al", as.Date("2020-01-04"), as.Date("2020-01-04"), NA, + "al", as.Date("2020-01-04"), as.Date("2020-01-05"), 9, ) %>% as_epi_archive(versions_end = as.Date("2022-01-01"), compactify = FALSE) -test_that("revision_summary works for a dummy dataset", { +dummy_ex_weekly <- dummy_ex$DT %>% + mutate(across( + c(time_value, version), + ~ as.Date("2020-01-01") + 7 * as.numeric(.x - as.Date("2020-01-01")) + )) %>% + as_epi_archive( + versions_end = as.Date("2022-01-01") + as.numeric(as.Date("2022-01-01") - as.Date("2020-01-01")) %% 7, + compactify = FALSE + ) +stopifnot(dummy_ex_weekly$time_type == "week") + +dummy_ex_yearmonthly <- dummy_ex$DT %>% + mutate(across( + c(time_value, version), + ~ tsibble::make_yearmonth(2020, 1) + as.numeric(.x - as.Date("2020-01-01")) + )) %>% + as_epi_archive( + versions_end = tsibble::make_yearmonth(2020, 1) + as.numeric(as.Date("2022-01-01") - as.Date("2020-01-01")), + compactify = FALSE + ) +stopifnot(dummy_ex_yearmonthly$time_type == 
"yearmonth") + +dummy_ex_integerly <- dummy_ex$DT %>% + mutate(across( + c(time_value, version), + ~ 1 + as.numeric(.x - as.Date("2020-01-01")) + )) %>% + as_epi_archive( + versions_end = 1 + as.numeric(as.Date("2022-01-01") - as.Date("2020-01-01")), + compactify = FALSE + ) +stopifnot(dummy_ex_integerly$time_type == "integer") + +test_that("revision_summary works for dummy datasets", { expect_snapshot(dummy_ex %>% revision_summary() %>% print(n = 10, width = 300)) expect_snapshot(dummy_ex %>% revision_summary(drop_nas = FALSE) %>% print(n = 10, width = 300)) + + # Weekly dummy is mostly just "day" -> "week", but quick-revision summary changes: + expect_snapshot(dummy_ex_weekly %>% revision_summary(drop_nas = FALSE) %>% print(n = 10, width = 300)) + # Yearmonthly has the same story. It would have been close to encountering + # min_waiting_period-based filtering but we actually set its versions_end to + # sometime in 2080 rather than 2022: + expect_snapshot(dummy_ex_yearmonthly %>% revision_summary(drop_nas = FALSE) %>% print(n = 10, width = 300)) + # Integer is very much like daily. We have to provide some of the + # configuration arguments since we have no idea about what the integers + # represent. If the possible integers being used have large jumps like + # YYYYww-as-integer epiweek labels (e.g., 200053 jumps to 200101) or are + # regularly spaced apart but by more than 1, we'll still be producing + # something nonsensical, but we tried. + expect_snapshot(dummy_ex_integerly %>% + revision_summary( + min_waiting_period = 60, quick_revision = 3, + drop_nas = FALSE + ) %>% + print(n = 10, width = 300)) }) + test_that("tidyselect is functional", { - expect_no_error(revision_summary(dummy_ex, value)) - expect_no_error(revision_summary(dummy_ex, starts_with("val"))) + expect_no_error(quiet(revision_summary(dummy_ex, value))) + expect_no_error(quiet(revision_summary(dummy_ex, starts_with("val")))) + # column order shouldn't matter + with_later_key_col <- dummy_ex$DT %>% + select(geo_value, time_value, value, version) %>% + as_epi_archive(versions_end = dummy_ex$versions_end, compactify = FALSE) + expect_equal( + quiet(revision_summary(with_later_key_col)), + quiet(revision_summary(dummy_ex)) + ) + # extra column shouldn't interfere + with_later_val_col <- dummy_ex$DT %>% + mutate(value2 = 0) %>% + as_epi_archive(versions_end = dummy_ex$versions_end, compactify = FALSE) + expect_equal( + quiet(revision_summary(with_later_val_col, value)), + quiet(revision_summary(dummy_ex, value)) + ) + # error when which column we're summarizing is ambiguous + expect_error( + dummy_ex$DT %>% + copy() %>% + mutate(value2 = value) %>% + as_epi_archive( + versions_end = dummy_ex$versions_end, + compactify = FALSE + ) %>% + revision_summary(), + class = "epiprocess__revision_summary_cannot_determine_default_selection" + ) + expect_error(revision_summary(with_later_val_col, !everything()), + class = "epiprocess__revision_summary__selected_zero_columns" + ) +}) + +test_that("revision_summary default min_waiting_period works as expected", { + # just outside the window for daily data + expect_equal( + tibble( + geo_value = 1, + time_value = as.Date("2020-01-01") + 0:1, + version = time_value + 1, + value = 1:2 + ) %>% + as_epi_archive(versions_end = as.Date("2020-01-01") + 1 + 59) %>% + revision_summary(print_inform = FALSE) %>% + pull(time_value), + as.Date("2020-01-01") + ) + # just outside the window for weekly data + expect_equal( + tibble( + geo_value = 1, + time_value = as.Date("2020-01-01") + 7 * (0:1), + 
version = time_value + 35, + value = 1:2 + ) %>% + as_epi_archive(versions_end = as.Date("2020-01-01") + 7 + 56) %>% + revision_summary(print_inform = FALSE) %>% + pull(time_value), + as.Date("2020-01-01") + ) + # just outside the window for monthly data + expect_equal( + tibble( + geo_value = 1, + time_value = tsibble::make_yearmonth(2000, 1:2), + version = time_value + 1, + value = 1:2 + ) %>% + as_epi_archive(versions_end = tsibble::make_yearmonth(2000, 3)) %>% + revision_summary(print_inform = FALSE) %>% + pull(time_value), + tsibble::make_yearmonth(2000, 1) + ) + # we don't know how to interpret the default in terms of "integer" time_type + expect_error( + tibble( + geo_value = 1, + time_value = 1:2 + 0, + version = time_value + 1, + value = 1:2 + ) %>% + as_epi_archive(versions_end = 1 + 1 + 59) %>% + revision_summary(print_inform = FALSE), + regexp = "Unsupported time_type" + ) }) -test_that("revision_summary works for various timetypes", {}) diff --git a/tests/testthat/test-time-utils.R b/tests/testthat/test-time-utils.R new file mode 100644 index 000000000..6fe8d78ad --- /dev/null +++ b/tests/testthat/test-time-utils.R @@ -0,0 +1,263 @@ +library(dplyr) + +test_that("guess_period works", { + # Error cases: + expect_error(guess_period(numeric(0L)), class = "epiprocess__guess_period__not_enough_times") + expect_error(guess_period(c(1)), class = "epiprocess__guess_period__not_enough_times") + # Different numeric classes and cases: + expect_identical(guess_period(c(1, 8)), 7) + expect_identical(guess_period(c(1, 8, 15)), 7) + expect_identical(guess_period(c(1L, 8L, 15L)), 7L) + expect_identical(guess_period(c(0, 7, 14, 15)), 1) + # We currently allow the guessed frequency to not appear in the diffs, but + # this might not be a good idea as it likely indicates an issue with the data + # (#485). 
+ expect_identical(guess_period(c(0, 2, 5)), 1) + expect_identical(guess_period(c(0, 4, 10)), 2) + # On Dates: + daily_dates <- seq(as.Date("2020-01-01"), as.Date("2020-01-15"), by = "day") + weekly_dates <- seq(as.Date("2020-01-01"), as.Date("2020-01-15"), by = "week") + expect_identical( + daily_dates[[1L]] + guess_period(daily_dates) * (seq_along(daily_dates) - 1L), + daily_dates + ) + expect_identical( + weekly_dates[[1L]] + guess_period(weekly_dates) * (seq_along(weekly_dates) - 1L), + weekly_dates + ) + # On POSIXcts: + daily_posixcts <- as.POSIXct(daily_dates, tz = "US/Aleutian") + 3600 + weekly_posixcts <- as.POSIXct(weekly_dates, tz = "US/Aleutian") + 3600 + expect_identical( + daily_posixcts[[1L]] + guess_period(daily_posixcts) * (seq_along(daily_posixcts) - 1L), + daily_posixcts + ) + expect_identical( + weekly_posixcts[[1L]] + guess_period(weekly_posixcts) * (seq_along(weekly_posixcts) - 1L), + weekly_posixcts + ) + # On POSIXlts: + daily_posixlts <- as.POSIXlt(daily_dates, tz = "UTC") + 3600 + weekly_posixlts <- as.POSIXlt(weekly_dates, tz = "UTC") + 3600 + expect_identical( + daily_posixlts[[1L]] + guess_period(daily_posixlts) * (seq_along(daily_posixlts) - 1L), + daily_posixlts + ) + expect_identical( + weekly_posixlts[[1L]] + guess_period(weekly_posixlts) * (seq_along(weekly_posixlts) - 1L), + weekly_posixlts + ) +}) + +test_that("validate_slide_window_arg works", { + for (time_type in c("day", "week", "integer", "yearmonth")) { + expect_no_error(validate_slide_window_arg(Inf, time_type)) + } + expect_no_error(validate_slide_window_arg(as.difftime(1, units = "days"), "day")) + expect_no_error(validate_slide_window_arg(1, "day")) + expect_error(validate_slide_window_arg(as.difftime(1, units = "weeks"), "day"), + class = "epiprocess__validate_slide_window_arg" + ) + expect_error(validate_slide_window_arg(as.difftime(1, units = "secs"), "day"), + class = "epiprocess__validate_slide_window_arg" + ) + + expect_no_error(validate_slide_window_arg(as.difftime(1, units = "weeks"), "week")) + expect_error(validate_slide_window_arg(1, "week"), + class = "epiprocess__validate_slide_window_arg" + ) + + expect_no_error(validate_slide_window_arg(1, "integer")) + expect_error(validate_slide_window_arg(as.difftime(1, units = "days"), "integer"), + class = "epiprocess__validate_slide_window_arg" + ) + expect_error(validate_slide_window_arg(as.difftime(1, units = "weeks"), "integer"), + class = "epiprocess__validate_slide_window_arg" + ) + + expect_no_error(validate_slide_window_arg(1, "yearmonth")) + expect_error(validate_slide_window_arg(as.difftime(1, units = "weeks"), "yearmonth"), + class = "epiprocess__validate_slide_window_arg" + ) +}) + +test_that("unit_time_delta works", { + for (format in c("friendly", "fast")) { + expect_equal( + as.Date("2020-01-01") + 5 * unit_time_delta("day", format = format), + as.Date("2020-01-06") + ) + expect_equal( + as.Date("2020-01-01") + 2 * unit_time_delta("week", format = format), + as.Date("2020-01-15") + ) + expect_equal( + tsibble::make_yearmonth(2000, 1) + 5 * unit_time_delta("yearmonth", format = format), + tsibble::make_yearmonth(2000, 6) + ) + expect_equal( + 1L + 5L * unit_time_delta("integer", format = format), + 6L + ) + # + expect_equal( + as.Date("2020-01-01") + + time_delta_to_n_steps(as.Date("2020-01-06") - as.Date("2020-01-01"), "day") * + unit_time_delta("day", format = format), + as.Date("2020-01-06") + ) + expect_equal( + as.Date("2020-01-01") + + time_delta_to_n_steps(as.integer(as.Date("2020-01-06") - as.Date("2020-01-01")), "day") 
* + unit_time_delta("day", format = format), + as.Date("2020-01-06") + ) + expect_equal( + as.Date("2020-01-01") + + time_delta_to_n_steps(as.Date("2020-01-15") - as.Date("2020-01-01"), "week") * + unit_time_delta("week", format = format), + as.Date("2020-01-15") + ) + expect_equal( + as.Date("2020-01-01") + + time_delta_to_n_steps(as.difftime(2, units = "weeks"), "week") * + unit_time_delta("week", format = format), + as.Date("2020-01-15") + ) + expect_equal( + tsibble::make_yearmonth(2000, 1) + + time_delta_to_n_steps(5, "yearmonth") * + unit_time_delta("yearmonth", format = format), + tsibble::make_yearmonth(2000, 6) + ) + expect_equal( + 1L + + time_delta_to_n_steps(5, "integer") * + unit_time_delta("integer", format = format), + 6L + ) + } +}) + +test_that("time_delta_to_approx_difftime works as expected", { + expect_equal( + time_delta_to_approx_difftime(as.difftime(3, units = "days"), "day"), + as.difftime(3, units = "days") + ) + expect_equal( + time_delta_to_approx_difftime(3, "day"), + as.difftime(3, units = "days") + ) + expect_equal( + time_delta_to_approx_difftime(3, "week"), + as.difftime(3, units = "weeks") + ) + expect_true(time_delta_to_approx_difftime(3, "yearmonth") %>% + `units<-`("days") %>% # nolint: indentation_linter + as.numeric() %>% + `-`(90) %>% + abs() %>% + `<=`(5)) + expect_error(time_delta_to_approx_difftime(3, "integer")) +}) + +test_that("format_time_delta works as expected", { + # time_type "day": + expect_equal( + format_time_delta(as.difftime(1, units = "days"), "day"), + "1 day" + ) + expect_equal( + format_time_delta(as.difftime(2, units = "days"), "day"), + "2 days" + ) + expect_equal( + format_time_delta(1, "day"), + "1 day" + ) + expect_equal( + format_time_delta(2, "day"), + "2 days" + ) + # time_type "week": + expect_equal( + format_time_delta(as.difftime(1, units = "weeks"), "week"), + "1 week" + ) + expect_equal( + format_time_delta(as.difftime(7, units = "days"), "week"), + "1 week" + ) + expect_equal( + format_time_delta(1, "week"), + "1 week" + ) + expect_equal( + format_time_delta(as.difftime(2, units = "weeks"), "week"), + "2 weeks" + ) + # time_type "yearmonth": + expect_equal( + format_time_delta(1, "yearmonth"), + "1 month" + ) + expect_equal( + format_time_delta(2, "yearmonth"), + "2 months" + ) + # time_type "integer": + expect_equal( + format_time_delta(1, "integer"), + "1 time step" + ) + expect_equal( + format_time_delta(2, "integer"), + "2 time steps" + ) + # we don't handle length != 1; pluralize will raise error for us: + expect_error(format_time_delta(numeric(0), "day")) # we don't handle length != 0 + expect_error(format_time_delta(1:5, "day")) # we don't handle length != 0 +}) + +test_that("difftime_approx_ceiling_time_delta works as expected", { + # At time of writing, docs don't guarantee difftime_approx_ceiling_time_delta + # will output friendly time_deltas, so we'll include a standardization step in + # these tests. 
Prevent eye-glazing repetitition by testing a bunch of cases + # simultaneously with dplyr: + comparisons <- tibble::tribble( + ~x_amount, ~x_units, ~time_type, ~expected_wrapped_friendly_result, + # days x day: + 0, "days", "day", list(as.difftime(0.0, units = "days")), + 1.5, "days", "day", list(as.difftime(2.0, units = "days")), + 2.0, "days", "day", list(as.difftime(2.0, units = "days")), + # days x week: + 2.0, "days", "week", list(as.difftime(1.0, units = "weeks")), + 7.0, "days", "week", list(as.difftime(1.0, units = "weeks")), + 8.0, "days", "week", list(as.difftime(2.0, units = "weeks")), + # weeks x week: + 1.0, "weeks", "week", list(as.difftime(1.0, units = "weeks")), + 1.1, "weeks", "week", list(as.difftime(2.0, units = "weeks")), + # days x yearmonth: + 2.0, "days", "yearmonth", list(1.0), + 32.0, "days", "yearmonth", list(2.0), + ) %>% + mutate(across(expected_wrapped_friendly_result, purrr::list_flatten)) %>% + rowwise() %>% + mutate( + wrapped_friendly_result = as.difftime(x_amount, units = x_units) %>% + difftime_approx_ceiling_time_delta(time_type) %>% + time_delta_standardize(time_type, format = "friendly") %>% + list() + ) %>% + ungroup() + + expect_equal( + comparisons$wrapped_friendly_result, + comparisons$expected_wrapped_friendly_result + ) + + # days x integer: + expect_error(difftime_approx_ceiling_time_delta(as.difftime(1, units = "days"), "integer"), + regexp = "Unsupported time_type" + ) +}) diff --git a/tests/testthat/test-utils.R b/tests/testthat/test-utils.R index 37125d533..55e79830a 100644 --- a/tests/testthat/test-utils.R +++ b/tests/testthat/test-utils.R @@ -280,141 +280,3 @@ test_that("as_slide_computation works", { h <- as_time_slide_computation(~ .x - .group_key) expect_equal(h(6, 3), 3) }) - -test_that("guess_period works", { - # Error cases: - expect_error(guess_period(numeric(0L)), class = "epiprocess__guess_period__not_enough_times") - expect_error(guess_period(c(1)), class = "epiprocess__guess_period__not_enough_times") - # Different numeric classes and cases: - expect_identical(guess_period(c(1, 8)), 7) - expect_identical(guess_period(c(1, 8, 15)), 7) - expect_identical(guess_period(c(1L, 8L, 15L)), 7L) - expect_identical(guess_period(c(0, 7, 14, 15)), 1) - # We currently allow the guessed frequency to not appear in the diffs, but - # this might not be a good idea as it likely indicates an issue with the data - # (#485). 
- expect_identical(guess_period(c(0, 2, 5)), 1) - expect_identical(guess_period(c(0, 4, 10)), 2) - # On Dates: - daily_dates <- seq(as.Date("2020-01-01"), as.Date("2020-01-15"), by = "day") - weekly_dates <- seq(as.Date("2020-01-01"), as.Date("2020-01-15"), by = "week") - expect_identical( - daily_dates[[1L]] + guess_period(daily_dates) * (seq_along(daily_dates) - 1L), - daily_dates - ) - expect_identical( - weekly_dates[[1L]] + guess_period(weekly_dates) * (seq_along(weekly_dates) - 1L), - weekly_dates - ) - # On POSIXcts: - daily_posixcts <- as.POSIXct(daily_dates, tz = "US/Aleutian") + 3600 - weekly_posixcts <- as.POSIXct(weekly_dates, tz = "US/Aleutian") + 3600 - expect_identical( - daily_posixcts[[1L]] + guess_period(daily_posixcts) * (seq_along(daily_posixcts) - 1L), - daily_posixcts - ) - expect_identical( - weekly_posixcts[[1L]] + guess_period(weekly_posixcts) * (seq_along(weekly_posixcts) - 1L), - weekly_posixcts - ) - # On POSIXlts: - daily_posixlts <- as.POSIXlt(daily_dates, tz = "UTC") + 3600 - weekly_posixlts <- as.POSIXlt(weekly_dates, tz = "UTC") + 3600 - expect_identical( - daily_posixlts[[1L]] + guess_period(daily_posixlts) * (seq_along(daily_posixlts) - 1L), - daily_posixlts - ) - expect_identical( - weekly_posixlts[[1L]] + guess_period(weekly_posixlts) * (seq_along(weekly_posixlts) - 1L), - weekly_posixlts - ) -}) - - -test_that("validate_slide_window_arg works", { - for (time_type in c("day", "week", "integer", "yearmonth")) { - expect_no_error(validate_slide_window_arg(Inf, time_type)) - } - expect_no_error(validate_slide_window_arg(as.difftime(1, units = "days"), "day")) - expect_no_error(validate_slide_window_arg(1, "day")) - expect_error(validate_slide_window_arg(as.difftime(1, units = "weeks"), "day"), - class = "epiprocess__validate_slide_window_arg" - ) - expect_error(validate_slide_window_arg(as.difftime(1, units = "secs"), "day"), - class = "epiprocess__validate_slide_window_arg" - ) - - expect_no_error(validate_slide_window_arg(as.difftime(1, units = "weeks"), "week")) - expect_error(validate_slide_window_arg(1, "week"), - class = "epiprocess__validate_slide_window_arg" - ) - - expect_no_error(validate_slide_window_arg(1, "integer")) - expect_error(validate_slide_window_arg(as.difftime(1, units = "days"), "integer"), - class = "epiprocess__validate_slide_window_arg" - ) - expect_error(validate_slide_window_arg(as.difftime(1, units = "weeks"), "integer"), - class = "epiprocess__validate_slide_window_arg" - ) - - expect_no_error(validate_slide_window_arg(1, "yearmonth")) - expect_error(validate_slide_window_arg(as.difftime(1, units = "weeks"), "yearmonth"), - class = "epiprocess__validate_slide_window_arg" - ) -}) - -test_that("unit_time_delta works", { - expect_equal( - as.Date("2020-01-01") + 5 * unit_time_delta("day"), - as.Date("2020-01-06") - ) - expect_equal( - as.Date("2020-01-01") + 2 * unit_time_delta("week"), - as.Date("2020-01-15") - ) - expect_equal( - tsibble::make_yearmonth(2000, 1) + 5 * unit_time_delta("yearmonth"), - tsibble::make_yearmonth(2000, 6) - ) - expect_equal( - 1L + 5L * unit_time_delta("integer"), - 6L - ) - # - expect_equal( - as.Date("2020-01-01") + - time_delta_to_n_steps(as.Date("2020-01-06") - as.Date("2020-01-01"), "day") * - unit_time_delta("day"), - as.Date("2020-01-06") - ) - expect_equal( - as.Date("2020-01-01") + - time_delta_to_n_steps(as.integer(as.Date("2020-01-06") - as.Date("2020-01-01")), "day") * - unit_time_delta("day"), - as.Date("2020-01-06") - ) - expect_equal( - as.Date("2020-01-01") + - 
time_delta_to_n_steps(as.Date("2020-01-15") - as.Date("2020-01-01"), "week") * - unit_time_delta("week"), - as.Date("2020-01-15") - ) - expect_equal( - as.Date("2020-01-01") + - time_delta_to_n_steps(as.difftime(2, units = "weeks"), "week") * - unit_time_delta("week"), - as.Date("2020-01-15") - ) - expect_equal( - tsibble::make_yearmonth(2000, 1) + - time_delta_to_n_steps(5, "yearmonth") * - unit_time_delta("yearmonth"), - tsibble::make_yearmonth(2000, 6) - ) - expect_equal( - 1L + - time_delta_to_n_steps(5, "integer") * - unit_time_delta("integer"), - 6L - ) -}) diff --git a/vignettes/compactify.Rmd b/vignettes/compactify.Rmd index 0101100a4..a1e7a1088 100644 --- a/vignettes/compactify.Rmd +++ b/vignettes/compactify.Rmd @@ -23,10 +23,10 @@ rows to remain. There are three different values that can be assigned to `compactify`: -* No argument: if there are LOCF-redundant rows, removes them and issues a - warning with some information about what rows were removed -* `TRUE`: removes any LOCF-redundant rows without any warning or other feedback -* `FALSE`: keeps any LOCF-redundant rows without any warning or other feedback +* `TRUE` (default): removes any LOCF-redundant rows without any message or other feedback +* `FALSE`: keeps any LOCF-redundant rows without any message or other feedback +* `"message"`: if there are LOCF-redundant rows, removes them and produces a + message with some information about what rows were removed For this example, we have one chart using LOCF values, while another doesn't use them to illustrate LOCF. Notice how the head of the first dataset differs diff --git a/vignettes/correlation.Rmd b/vignettes/correlation.Rmd index 8d5054bc0..22a794c96 100644 --- a/vignettes/correlation.Rmd +++ b/vignettes/correlation.Rmd @@ -24,7 +24,8 @@ library(epiprocess) library(dplyr) ``` -The data is included in this package (via the [`epidatasets` package](https://cmu-delphi.github.io/epidatasets/)) and can be loaded with: +The data is included in the [`epidatasets` package](https://cmu-delphi.github.io/epidatasets/), +which is loaded along with `epiprocess`, and can be accessed with: ```{r} x <- covid_case_death_rates_extended %>% arrange(geo_value, time_value) @@ -35,7 +36,7 @@ The data can also be fetched from the Delphi Epidata API with the following quer ```{r, eval = FALSE} library(epidatr) -d <- as.Date("2024-03-20") +d <- as.Date("2023-03-10") x <- pub_covidcast( source = "jhu-csse", diff --git a/vignettes/epi_archive.Rmd b/vignettes/epi_archive.Rmd index 6d8749cc2..f87ea2915 100644 --- a/vignettes/epi_archive.Rmd +++ b/vignettes/epi_archive.Rmd @@ -37,8 +37,8 @@ signal is subject to very heavy and regular revision; you can read more about it on its [API documentation page](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/doctor-visits.html). -The data is included in this package (via the [`epidatasets` -package](https://cmu-delphi.github.io/epidatasets/)) and can be loaded with: +The data is included in the [`epidatasets` package](https://cmu-delphi.github.io/epidatasets/), +which is loaded along with `epiprocess`, and can be accessed with: ```{r, message = FALSE, warning = FALSE} library(epiprocess) @@ -87,7 +87,7 @@ format, with `issue` playing the role of `version`. 
We can now use ```{r} dv_archive <- dv %>% select(geo_value, time_value, version, percent_cli) %>% - as_epi_archive(compactify = TRUE) + as_epi_archive() dv_archive ``` @@ -182,7 +182,7 @@ revision_details %>% max_lag = max(max_lag), spread = mean(spread), rel_spread = mean(rel_spread), - time_near_latest = mean(time_near_latest) + lag_near_latest = mean(lag_near_latest) ) ``` @@ -216,9 +216,9 @@ y <- pub_covidcast( issues = epirange(20200601, 20211201) ) %>% select(geo_value, time_value, version = issue, case_rate_7d_av = value) %>% - as_epi_archive(compactify = TRUE) + as_epi_archive() -dv_cases_archive <- epix_merge(dv_archive, y, sync = "locf", compactify = TRUE) +dv_cases_archive <- epix_merge(dv_archive, y, sync = "locf") print(dv_cases_archive) ``` diff --git a/vignettes/epi_df.Rmd b/vignettes/epi_df.Rmd index de10b9afd..ac1ba41fe 100644 --- a/vignettes/epi_df.Rmd +++ b/vignettes/epi_df.Rmd @@ -32,8 +32,8 @@ library(epiprocess) library(dplyr) ``` -The data is included in this package (via the [`epidatasets` -package](https://cmu-delphi.github.io/epidatasets/)) and can be loaded with: +The data is included in the [`epidatasets` package](https://cmu-delphi.github.io/epidatasets/), +which is loaded along with `epiprocess`, and can be accessed with: ```{r} edf <- cases_deaths_subset %>% @@ -119,7 +119,8 @@ edf %>% complete(geo_value, time_value = seq.Date(min(time_value), max(time_value), by = "day")) %>% arrange_canonical() %>% group_by(geo_value) %>% - mutate(cases_7sd = slider::slide_dbl(cases, .f = sd, na.rm = TRUE, .before = 7, .after = 0)) + mutate(cases_7sd = slider::slide_dbl(cases, .f = sd, na.rm = TRUE, .before = 7, .after = 0)) %>% + ungroup() ``` Furthermore `epi_slide()` allows for selecting `.ref_time_value`, which the @@ -135,7 +136,8 @@ where - `g` is a one-row tibble containing the values of the grouping variables for the associated group, for instance `g$geo_value` - `t` is the ref_time_value for the current window -- `...` are additional arguments +- `...` (optional) are any additional arguments you'd like to be able to forward + to your function The same computation as above can be done with a function: @@ -159,8 +161,8 @@ formula or function case. This can be adjusted with `.new_col_name`. ### Rolling computations with multiple column outputs -If your formula (or function) returns a data.frame, then the columns of the -data.frame will be unpacked into the resulting `epi_df` (in the sense of +If your formula (or function) returns a tibble (or other kind of data frame), +then its columns will be unpacked into the resulting `epi_df` (in the sense of `tidyr::unpack()`). For example, the following computes the 7-day trailing average of daily cases as well as the the 7-day trailing standard deviation of daily cases: @@ -168,7 +170,10 @@ daily cases: ```{r} edf %>% epi_slide( - ~ data.frame(cases_mean = mean(.x$cases, na.rm = TRUE), cases_sd = sd(.x$cases, na.rm = TRUE)), + ~ tibble( + cases_mean = mean(.x$cases, na.rm = TRUE), + cases_sd = sd(.x$cases, na.rm = TRUE) + ), .window_size = 7 ) ``` @@ -179,14 +184,13 @@ For the two most common sliding operations, we offer two optimized versions: `epi_slide_mean()` and `epi_slide_sum()`. These are much faster than `epi_slide()`, so we recommend using them when you are only interested in the mean or sum of a column. 
The following computes the 7-day trailing mean of daily -cases: +cases, allowing means and sums to be taken over fewer than 7 observations if +there is missingness (`na.rm = TRUE`): ```{r} edf %>% - group_by(geo_value) %>% epi_slide_mean("cases", .window_size = 7, na.rm = TRUE) edf %>% - group_by(geo_value) %>% epi_slide_sum("cases", .window_size = 7, na.rm = TRUE) ``` @@ -205,16 +209,48 @@ An `epi_df` object can have more key columns than just `geo_value` and can add this as a key column. We can then aggregate the data by these key columns using `sum_groups_epi_df()`. Let's use influenza hospitalization rate data from the CDC system FluSurv as an example. We can get it from the [Delphi -Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/flusurv.html) +Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/flusurv.html). ```{r} library(epidatr) -flu_data <- pub_flusurv( +flu_data_api <- pub_flusurv( locations = "ca", - epiweeks = epirange(201801, 202001), -) %>% - select(location, epiweek, issue, rate_age_0, rate_age_1, rate_age_2, rate_age_3, rate_age_4) %>% - tidyr::pivot_longer(cols = starts_with("rate_age_"), names_to = "age_group", values_to = "rate") + epiweeks = epirange(201801, 202001) +) +``` + +We're interested in the age-specific rates: +```{r} +flu_data <- flu_data_api %>% + select(location, epiweek, rate_age_0, rate_age_1, rate_age_2, rate_age_3, rate_age_4) %>% + # Turn `rate_age_0`..`rate_age_4` columns into an `age_group` and `rate` + # column (with 5x as many rows): + tidyr::pivot_longer( + cols = starts_with("rate_age_"), names_to = "age_group", values_to = "rate", + # When converting column names to entries in `age_group`, remove this prefix: + names_prefix = "rate_age_", + # And add a better prefix: + names_transform = function(age_group) paste0("age_group_", age_group) + ) %>% + # Improve `age_group` labels a bit more: + mutate( + age_group = case_match( + age_group, + "age_group_0" ~ "0--4 yr", + "age_group_1" ~ "5--17 yr", + "age_group_2" ~ "18--49 yr", + "age_group_3" ~ "50--64 yr", + "age_group_4" ~ ">= 65 yr", + # Make this a factor with appropriate level ordering: + .ptype = factor(levels = c( + "0--4 yr", "5--17 yr", "18--49 yr", + "50--64 yr", ">= 65 yr" + )) + ) + ) %>% + # The API currently outputs `epiweek` in Date format (the constituent Sunday); + # rename it to remind us that it's not in YYYYww format: + rename(time_value = epiweek) flu_data ``` @@ -222,23 +258,106 @@ We can now convert this data to an `epi_df` object and set the `age_group` column as an additional group key: ```{r} -flu_data <- flu_data %>% as_epi_df(other_keys = "age_group", as_of = as.Date("2024-03-20")) +flu_data <- flu_data %>% as_epi_df(other_keys = "age_group") flu_data ``` Note that the `epi_df` object now has an additional key column `age_group`. This means that there should only be one row for each combination of `geo_value`, -`time_value`, and `age_group` in the dataset (this is enforced at construction +`age_group`, and `time_value` in the dataset (this is enforced at construction time). -Now we can aggregate the data by `age_group`, if we want to compute the total: +Now we can aggregate the data by `age_group`, if we want to compute the total. +For count data, this would be just a single call to `sum_groups_epi_df()`. Since we +are working with rates, we need to attach some population data in order to do +this aggregation. 
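To make the count-data contrast concrete, here is a small hypothetical sketch (the `flu_counts` object and its `count` column are invented for illustration; they are not part of the FluSurv data used in this vignette):

```r
library(epiprocess)
library(dplyr)

# Hypothetical admission counts, keyed the same way as flu_data above:
flu_counts <- tibble::tibble(
  geo_value = "ca",
  age_group = rep(c("0--4 yr", ">= 65 yr"), each = 2),
  time_value = rep(as.Date(c("2018-01-07", "2018-01-14")), times = 2),
  count = c(10, 12, 40, 45)
) %>%
  as_epi_df(other_keys = "age_group", as_of = as.Date("2018-02-01"))

# For counts, aggregating across age groups needs no population weighting:
flu_counts %>%
  sum_groups_epi_df("count", group_cols = "geo_value")
```

Rates, by contrast, must be weighted by each age group's share of the population before they can be summed, which is what the remainder of this section does.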
It's somewhat ambiguous whether FluSurv-NET reporting uses +either [NCHS](https://www.cdc.gov/nchs/nvss/bridged_race.htm) or +[Census](https://www.census.gov/programs-surveys/popest/technical-documentation/research/evaluation-estimates/2020-evaluation-estimates/2010s-county-detail.html) +populations for `time_value`s before 2020 included in reports published in 2020 +onward, but at least for this example, these two sources agree exactly. +FluSurv-NET also directly reports an overall rate, so we can check our work. +```{r} +# Population estimates for FluSurv-NET-covered part of CA on 2017-07-01 and +# 2018-07-01, extracted and aggregated from "vintage 2020" estimates (actually +# released by Census in June 2021 and by NCHS in September 2021), which is the +# last available reporting found with population estimates for 2017 and 2018: +pop <- tribble( + ~geo_value, ~age_group, ~time_value, ~pop, + "CA", "0--4 yr", as.Date("2017-07-01"), 203813, + "CA", "5--17 yr", as.Date("2017-07-01"), 521827, + "CA", "18--49 yr", as.Date("2017-07-01"), 1722399, + "CA", "50--64 yr", as.Date("2017-07-01"), 700090, + "CA", ">= 65 yr", as.Date("2017-07-01"), 534789, + "CA", "0--4 yr", as.Date("2018-07-01"), 201265, + "CA", "5--17 yr", as.Date("2018-07-01"), 520077, + "CA", "18--49 yr", as.Date("2018-07-01"), 1725382, + "CA", "50--64 yr", as.Date("2018-07-01"), 699145, + "CA", ">= 65 yr", as.Date("2018-07-01"), 551243, +) +# Calculate fraction of total population in each age group. +pop <- pop %>% + group_by(geo_value, time_value) %>% + mutate(frac_pop = pop / sum(pop)) %>% + ungroup() +``` + +After joining population onto the rate data, we can calculate the +population-weighted rate for each age group, that is, the portion of each +`age_group`'s rate that it contributes to the overall rate. + +```{r} +fractional_rate_by_age_group <- + flu_data %>% + inner_join( + pop, + # Simple population interpolation/extrapolation scheme: last observation + # carried forward. Use the estimated population on 2017-07-01 for + # time_values 2017-07-01 through 2018-06-30, and the estimated population on + # 2018-07-01 for all subsequent time_values: + join_by(geo_value, age_group, closest(y$time_value <= x$time_value)), + # Generate errors if the two data sets don't line up as expected: + relationship = "many-to-one", unmatched = "error", + # We'll get a second time column indicating which population estimate + # was carried forward; name it time_value_for_pop: + suffix = c("", "_for_pop") + ) %>% + mutate(rate_contrib = rate * frac_pop) +``` + +We can then use `sum_groups_epi_df` to sum population-weighted rate across all +age groups to get the overall rate. ```{r} -group_cols <- key_colnames(exclude = "age_group") -flu_data %>% - sum_groups_epi_df("rate", group_cols = group_cols) +rate_overall_recalc_edf <- + fractional_rate_by_age_group %>% + sum_groups_epi_df("rate_contrib", group_cols = c("geo_value")) %>% + rename(rate_overall_recalc = rate_contrib) %>% + # match rounding of original data: + mutate(rate_overall_recalc = round(rate_overall_recalc, 1)) +rate_overall_recalc_edf ``` +Let's compare our calculated rate to the overall rate reported by FluSurv-NET. 
+ +```{r} +rate_overall_recalc_edf <- + rate_overall_recalc_edf %>% + # compare to published overall rates: + inner_join( + flu_data_api %>% + select(geo_value = location, time_value = epiweek, rate_overall), + by = c("geo_value", "time_value"), + # validate that we have exactly the same set of geo_value x time_value combinations: + relationship = "one-to-one", unmatched = "error" + ) +# What's our maximum error vs. the official overall estimates? +max(abs(rate_overall_recalc_edf$rate_overall - rate_overall_recalc_edf$rate_overall_recalc)) +``` +This small amount of difference is expected, since all the age-specific rates +were rounded to the first decimal place, and population data might have been +interpolated and extrapolated a bit differently in the official source, limiting +our ability to precisely recreate its estimates from an age group breakdown. + ## Detecting and filling time gaps with `complete.epi_df` Sometimes you may have missing data in your time series. This can be due to @@ -254,7 +373,8 @@ First, let's create a data set with some missing data. We will reuse the dataset edf_missing <- edf %>% filter(geo_value %in% c("ca", "tx")) %>% group_by(geo_value) %>% - slice(1:3, 5:6) + slice(1:3, 5:6) %>% + ungroup() edf_missing %>% print(n = 10) @@ -263,21 +383,28 @@ edf_missing %>% Now let's fill in the missing data with explicit zeros: ```{r} -edf_missing %>% +edf_missing <- edf_missing %>% + group_by(geo_value) %>% complete( time_value = seq.Date(min(time_value), max(time_value), by = "day"), fill = list(cases = 0) ) %>% + ungroup() + +edf_missing %>% print(n = 12) ``` +We see that rows have been added for the missing `time_value` 2020-03-04 for +both of the states, with `cases` set to `0`. If there were explicit `NA`s in the +`cases` column, those would have been replaced by `0` as well. ### Detecting and filling time gaps with `tsibble` We can also use the `tsibble` package to detect and fill time gaps. We'll work with county-level reported COVID-19 cases in MA and VT. -The data is included in this package (via the [`epidatasets` -package](https://cmu-delphi.github.io/epidatasets/)) and can be loaded with: +The data is included in the [`epidatasets` package](https://cmu-delphi.github.io/epidatasets/), +which is loaded along with `epiprocess`, and can be accessed with: ```{r, warning = FALSE, message = FALSE} library(epiprocess) @@ -515,4 +642,3 @@ Engineering. Copyright Johns Hopkins University 2020. API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html): These signals are taken directly from the JHU CSSE [COVID-19 GitHub repository](https://github.com/CSSEGISandData/COVID-19) without changes. - diff --git a/vignettes/epiprocess.Rmd b/vignettes/epiprocess.Rmd index b9648e00b..d979d685e 100644 --- a/vignettes/epiprocess.Rmd +++ b/vignettes/epiprocess.Rmd @@ -37,7 +37,6 @@ object, which we prepared by downloading the data using `epidatr::pub_covidcast()`. 
```{r, results=FALSE, warning=FALSE, message=FALSE} -library(epidatasets) library(epidatr) library(epiprocess) library(dplyr) @@ -169,7 +168,7 @@ dv <- pub_covidcast( issues = epirange(20200601, 20211201) ) %>% select(geo_value, time_value, issue, percent_cli = value) %>% - as_epi_archive(compactify = TRUE) + as_epi_archive() dv ``` @@ -183,7 +182,7 @@ library(ggplot2) dv <- archive_cases_dv_subset$DT %>% select(-case_rate_7d_av) %>% tidyr::drop_na() %>% - as_epi_archive(compactify = TRUE) + as_epi_archive() dv ``` diff --git a/vignettes/growth_rate.Rmd b/vignettes/growth_rate.Rmd index 8f332de3a..67a75903a 100644 --- a/vignettes/growth_rate.Rmd +++ b/vignettes/growth_rate.Rmd @@ -24,7 +24,8 @@ library(dplyr) library(tidyr) ``` -The data is included in this package (via the [`epidatasets` package](https://cmu-delphi.github.io/epidatasets/)) and can be loaded with: +The data is included in the [`epidatasets` package](https://cmu-delphi.github.io/epidatasets/), +which is loaded along with `epiprocess`, and can be accessed with: ```{r} x <- cases_deaths_subset %>% @@ -81,7 +82,7 @@ the following methods for estimating the growth rate at a given reference point of the spline at `x0`. * "trend_filter": uses the estimated derivative at `x0` from polynomial trend filtering (a discrete spline) fit to `x` and `y`, via - `genlasso::trendfilter()`, divided by the fitted value of the discrete spline + `trendfilter::trendfilter()`, divided by the fitted value of the discrete spline at `x0`. The default in `growth_rate()` is `x0 = x`, so that it returns an estimate of @@ -98,7 +99,7 @@ the computed growth rates. ```{r} x <- x %>% group_by(geo_value) %>% - mutate(cases_gr1 = growth_rate(time_value, cases)) + mutate(cases_gr1 = growth_rate(cases)) head(x, 10) ``` @@ -161,7 +162,7 @@ but thankfully avoids some of the troublesome spikes: ```{r, message = FALSE, warning = FALSE, fig.width = 9, fig.height = 7} x <- x %>% group_by(geo_value) %>% - mutate(cases_gr2 = growth_rate(time_value, cases, method = "linear_reg")) + mutate(cases_gr2 = growth_rate(cases, method = "linear_reg")) x %>% pivot_longer( @@ -188,17 +189,20 @@ We can also use a nonparametric method to estimate the derivative, through computationally expensive, but it is also able to adapt better to the local level of smoothness. (The apparent efficiency is actually compounded by the particular implementations and default settings for these methods: -"trend_filter" is based on a full solution path algorithm provided in the -`genlasso` package, and performs cross-validation by default in order to pick -the level of regularization; read the documentation for `growth_rate()` more +"trend_filter" is based on a sequence of solutions provided in the +`trendfilter` package, and performs cross-validation by default in order to pick +the level of regularization; read the documentation for `growth_rate()` for more details.) +Note: The `trendfilter` package is not automatically installed with `epiprocess`. +To install it from GitHub, you can use `pak::pkg_install("glmgen/trendfilter")`. 
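If you want the chunk below to be robust to that optional dependency being absent, a guard along these lines works (a sketch, assuming you are happy to install from GitHub on the fly):

```r
# Sketch: install the optional trendfilter dependency only if it is missing.
# pak::pkg_install() requires the pak package to be installed.
if (!requireNamespace("trendfilter", quietly = TRUE)) {
  pak::pkg_install("glmgen/trendfilter")
}
```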
+ ```{r, message = FALSE, warning = FALSE, fig.width = 9, fig.height = 7} x <- x %>% group_by(geo_value) %>% mutate( - cases_gr3 = growth_rate(time_value, cases, method = "smooth_spline"), - cases_gr4 = growth_rate(time_value, cases, method = "trend_filter") + cases_gr3 = growth_rate(cases, method = "smooth_spline"), + cases_gr4 = growth_rate(cases, method = "trend_filter") ) x %>% @@ -227,9 +231,9 @@ stable than the estimates from local relative changes and linear regressions. The smoothing spline growth rate estimates are based on the default settings in `stats::smooth.spline()`, and appear severely under-regularized here. Any of the arguments to `stats::smooth.spline()` can be customized by passing them as -additional arguments `...` in the call to `growth_rate()`; similarly, we can +additional arguments to `growth_rate_params()`; similarly, we can also use additional arguments to customize the settings in the underlying trend -filtering functions `genlasso::trendfilter()`, `genlasso::cv.trendfilter()`, and +filtering functions `trendfilter::trendfilter()`, `trendfilter::cv_trendfilter()`, and the documentation for `growth_rate()` gives the full details. ## Log scale estimation @@ -246,22 +250,10 @@ the call to `growth_rate()`. x <- x %>% group_by(geo_value) %>% mutate( - cases_gr5 = growth_rate(time_value, cases, - method = "rel_change", - log_scale = TRUE - ), - cases_gr6 = growth_rate(time_value, cases, - method = "linear_reg", - log_scale = TRUE - ), - cases_gr7 = growth_rate(time_value, cases, - method = "smooth_spline", - log_scale = TRUE - ), - cases_gr8 = growth_rate(time_value, cases, - method = "trend_filter", - log_scale = TRUE - ) + cases_gr5 = growth_rate(cases, method = "rel_change", log_scale = TRUE), + cases_gr6 = growth_rate(cases, method = "linear_reg", log_scale = TRUE), + cases_gr7 = growth_rate(cases, method = "smooth_spline", log_scale = TRUE), + cases_gr8 = growth_rate(cases, method = "trend_filter", log_scale = TRUE) ) x %>%