cmu-delphi · dshemetov · Aug 23, 2024 · Aug 23, 2024 · Aug 13, 2024 · Aug 13, 2024
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -4,7 +4,7 @@ Title: Tools for basic signal processing in epidemiology
 Version: 0.8.5
 Authors@R: c(
     person("Jacob", "Bien", role = "ctb"),
-    person("Logan", "Brooks", email = "[email protected]", role = c("aut", "cre")),
+    person("Logan", "Brooks", , "[email protected]", role = c("aut", "cre")),
     person("Rafael", "Catoia", role = "ctb"),
     person("Nat", "DeFries", role = "ctb"),
     person("Daniel", "McDonald", role = "aut"),
@@ -15,16 +15,22 @@ Authors@R: c(
     person("Evan", "Ray", role = "aut"),
     person("Dmitry", "Shemetov", role = "ctb"),
     person("Ryan", "Tibshirani", role = "aut"),
-    person("Lionel", "Henry", role = "ctb", comment = "Author of included rlang fragments"),
-    person("Hadley", "Wickham", role = "ctb", comment = "Author of included rlang fragments"),
-    person("Posit", role = "cph", comment = "Copyright holder of included rlang fragments")
+    person("Lionel", "Henry", role = "ctb",
+           comment = "Author of included rlang fragments"),
+    person("Hadley", "Wickham", role = "ctb",
+           comment = "Author of included rlang fragments"),
+    person("Posit", role = "cph",
+           comment = "Copyright holder of included rlang fragments")
   )
-Description: This package introduces a common data structure for epidemiological
-    data reported by location and time, provides another data structure to
-    work with revisions to these data sets over time, and offers associated
-    utilities to perform basic signal processing tasks.
+Description: This package introduces a common data structure for
+    epidemiological data reported by location and time, provides another
+    data structure to work with revisions to these data sets over time,
+    and offers associated utilities to perform basic signal processing
+    tasks.
 License: MIT + file LICENSE
-Copyright: file inst/COPYRIGHTS
+URL: https://cmu-delphi.github.io/epiprocess/
+Depends:
+    R (>= 3.6)
 Imports:
     checkmate,
     cli,
@@ -58,18 +64,16 @@ VignetteBuilder:
     knitr
 Remotes:
     cmu-delphi/epidatr,
-    reconverse/outbreaks,
-    glmgen/genlasso
+    glmgen/genlasso,
+    reconverse/outbreaks
 Config/testthat/edition: 3
 Config/testthat/parallel: true
+Copyright: file inst/COPYRIGHTS
 Encoding: UTF-8
 LazyData: true
 Roxygen: list(markdown = TRUE)
 RoxygenNote: 7.3.2
-Depends:
-    R (>= 2.10)
-URL: https://cmu-delphi.github.io/epiprocess/
-Collate:
+Collate: 
     'archive.R'
     'autoplot.R'
     'correlation.R'

diff --git a/NEWS.md b/NEWS.md
@@ -5,7 +5,8 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat
 # epiprocess 0.9
 
 ## Breaking changes
-- In `epi[x]_slide`:
+
+- In `epi[x]_slide`:
   - `names_sep` is deprecated, and if you return data frames from your
     computations, they will no longer be unpacked into separate columns with
     name prefixes; instead:
@@ -15,12 +16,18 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat
       packed data.frame-class column (see `tidyr::pack`).
   - `as_list_col` is deprecated; you can now directly return a list from your
     slide computations instead.
+- `additional_metadata` is no longer accepted in `as_epi_df()` or
+  `as_epi_archive()`. Use the new `other_keys` arg to specify additional key
+  columns, such as age group columns or other demographic breakdowns.
+  Miscellaneous metadata are no longer handled by `epiprocess`, but you can use
+  R's built-in `attr<-` instead for a similar feature.
 
 ## Improvements
 
 - Added `complete.epi_df`, which fills in missing values in an `epi_df` with
   `NA`s. Uses `tidyr::complete` underneath and preserves `epi_df` metadata.
-- Inclusion of the function `revision_summary` to provide basic revision information for `epi_archive`s out of the box. (#492)
+- Inclusion of the function `revision_summary` to provide basic revision
+  information for `epi_archive`s out of the box. (#492)
 
 ## Bug fixes
 
@@ -87,8 +94,8 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat
   - Multiple "data-masking" tidy evaluation expressions can be passed in via
     `...`, rather than just one.
   - Additional tidy evaluation features from `dplyr::mutate` are supported: `!!
-    name_var := value`, unnamed expressions evaluating to data frames, and `=
-    NULL`; see `?epi_slide` for more details.
+    name_var := value`, unnamed expressions evaluating to data frames, and `=
+    NULL`; see `?epi_slide` for more details.
 
 ## Cleanup
 

diff --git a/R/archive.R b/R/archive.R
@@ -179,7 +179,8 @@ NULL
 #'
 #' * `geo_type`: the type for the geo values.
 #' * `time_type`: the type for the time values.
-#' * `additional_metadata`: list of additional metadata for the data archive.
+#' * `other_keys`: any additional keys as a character vector.
+#'    Typical examples are "age" or sub-geographies.
 #'
 #' While this metadata is not protected, it is generally recommended to treat it
 #'  as read-only, and to use the `epi_archive` methods to interact with the data
@@ -209,10 +210,8 @@ NULL
 #' if the time type is not recognized.
 #' @param other_keys Character vector specifying the names of variables in `x`
 #'   that should be considered key variables (in the language of `data.table`)
-#'   apart from "geo_value", "time_value", and "version".
-#' @param additional_metadata List of additional metadata to attach to the
-#'   `epi_archive` object. The metadata will have the `geo_type` field; named
-#'   entries from the passed list or will be included as well.
+#'   apart from "geo_value", "time_value", and "version". Typical examples
+#'   are "age" or more granular geographies.
 #' @param compactify Optional; Boolean. `TRUE` will remove some
 #'   redundant rows, `FALSE` will not, and missing or `NULL` will remove
 #'   redundant rows, but issue a warning. See more information at `compactify`.
@@ -293,7 +292,6 @@ new_epi_archive <- function(
     geo_type,
     time_type,
     other_keys,
-    additional_metadata,
     compactify,
     clobberable_versions_start,
     versions_end,
@@ -350,7 +348,7 @@ new_epi_archive <- function(
       DT = compactified,
       geo_type = geo_type,
       time_type = time_type,
-      additional_metadata = additional_metadata,
+      other_keys = other_keys,
       clobberable_versions_start = clobberable_versions_start,
       versions_end = versions_end
     ),
@@ -423,7 +421,6 @@ is_locf <- function(vec, tolerance) { # nolint: object_usage_linter
 validate_epi_archive <- function(
     x,
     other_keys,
-    additional_metadata,
     compactify,
     clobberable_versions_start,
     versions_end) {
@@ -434,9 +431,6 @@ validate_epi_archive <- function(
   if (any(c("geo_value", "time_value", "version") %in% other_keys)) {
     cli_abort("`other_keys` cannot contain \"geo_value\", \"time_value\", or \"version\".")
   }
-  if (any(names(additional_metadata) %in% c("geo_type", "time_type"))) {
-    cli_warn("`additional_metadata` names overlap with existing metadata fields \"geo_type\" or \"time_type\".")
-  }
 
   # Conduct checks and apply defaults for `compactify`
   assert_logical(compactify, len = 1, any.missing = FALSE, null.ok = TRUE)
@@ -485,8 +479,7 @@ as_epi_archive <- function(
     x,
     geo_type = deprecated(),
     time_type = deprecated(),
-    other_keys = character(0L),
-    additional_metadata = list(),
+    other_keys = character(),
     compactify = NULL,
     clobberable_versions_start = NA,
     .versions_end = max_version_with_row_in(x), ...,
@@ -518,11 +511,10 @@ as_epi_archive <- function(
   time_type <- guess_time_type(x$time_value)
 
   validate_epi_archive(
-    x, other_keys, additional_metadata,
-    compactify, clobberable_versions_start, versions_end
+    x, other_keys, compactify, clobberable_versions_start, versions_end
   )
   new_epi_archive(
-    x, geo_type, time_type, other_keys, additional_metadata,
+    x, geo_type, time_type, other_keys,
     compactify, clobberable_versions_start, versions_end
   )
 }
@@ -551,7 +543,7 @@ print.epi_archive <- function(x, ..., class = TRUE, methods = TRUE) {
     c(
       ">" = if (class) "An `epi_archive` object, with metadata:",
       "i" = if (length(setdiff(key(x$DT), c("geo_value", "time_value", "version"))) > 0) {
-        "Non-standard DT keys: {setdiff(key(x$DT), c('geo_value', 'time_value', 'version'))}"
+        "Other DT keys: {setdiff(key(x$DT), c('geo_value', 'time_value', 'version'))}"
       },
       "i" = if (nrow(x$DT) != 0L) {
         "Min/max time values: {min(x$DT$time_value)} / {max(x$DT$time_value)}"
@@ -687,7 +679,8 @@ print.epi_archive <- function(x, ..., class = TRUE, methods = TRUE) {
 #' @export
 #'
 #' @aliases grouped_epi_archive
-group_by.epi_archive <- function(.data, ..., .add = FALSE, .drop = dplyr::group_by_drop_default(.data)) {
+group_by.epi_archive <- function(.data, ..., .add = FALSE,
+                                 .drop = dplyr::group_by_drop_default(.data)) {
   # `add` makes no difference; this is an ungrouped `epi_archive`.
   detailed_mutate <- epix_detailed_restricted_mutate(.data, ...)
   assert_logical(.drop)

diff --git a/R/epi_df.R b/R/epi_df.R
@@ -127,7 +127,7 @@
 #'   dplyr::rename(geo_value = state, time_value = reported_date) %>%
 #'   as_epi_df(
 #'     as_of = "2020-06-03",
-#'     additional_metadata = list(other_keys = "pol")
+#'     other_keys = "pol"
 #'   )
 #'
 #' attr(ex2, "metadata")
@@ -146,47 +146,46 @@
 #'     state = rep("MA", 6),
 #'     pol = rep(c("blue", "swing", "swing"), each = 2)
 #'   ) %>%
-#'   # the 2 extra keys we added have to be specified in the other_keys
-#'   # component of additional_metadata.
-#'   as_epi_df(additional_metadata = list(other_keys = c("state", "pol")))
+#'   as_epi_df(other_keys = c("state", "pol"))
 #'
 #' attr(ex3, "metadata")
 NULL
 
-#' Create an `epi_df` object
-#'
-#' @rdname epi_df
-#' @param geo_type DEPRECATED Has no effect. Geo value type is inferred from the
-#' location column and set to "custom" if not recognized.
-#' @param time_type DEPRECATED Has no effect. Time value type inferred from the time
-#' column and set to "custom" if not recognized. Unpredictable behavior may result
-#' if the time type is not recognized.
+#' @describeIn epi_df Lower-level constructor for an `epi_df` object
+#' @order 2
+#' @param geo_type `r lifecycle::badge("deprecated")` in `as_epi_df()`, has no
+#'   effect; the geo value type is inferred from the location column and set to
+#'   "custom" if not recognized. In `new_epi_df()`, should be set to the same
+#'   value that would be inferred.
+#' @param time_type `r lifecycle::badge("deprecated")` in `as_epi_df()`, has no
+#'   effect; the time value type is inferred from the time column and set to
+#'   "custom" if not recognized. Unpredictable behavior may result if the time
+#'   type is not recognized. In `new_epi_df()`, should be set to the same value
+#'   that would be inferred.
 #' @param as_of Time value representing the time at which the given data were
 #'   available. For example, if `as_of` is January 31, 2022, then the `epi_df`
 #'   object that is created would represent the most up-to-date version of the
 #'   data available as of January 31, 2022. If the `as_of` argument is missing,
 #'   then the current day-time will be used.
-#' @param additional_metadata List of additional metadata to attach to the
-#'   `epi_df` object. The metadata will have `geo_type`, `time_type`, and
-#'   `as_of` fields; named entries from the passed list will be included as
-#'   well. If your tibble has additional keys, be sure to specify them as a
-#'   character vector in the `other_keys` component of `additional_metadata`.
+#' @param other_keys If your tibble has additional keys, be sure to specify them
+#'   as a character vector here (typical examples are "age" or sub-geographies).
 #' @param ... Additional arguments passed to methods.
 #' @return An `epi_df` object.
 #'
 #' @export
-new_epi_df <- function(x = tibble::tibble(), geo_type, time_type, as_of,
-                       additional_metadata = list()) {
+new_epi_df <- function(x = tibble::tibble(geo_value = character(), time_value = as.Date(integer())),
+                       geo_type, time_type, as_of,
+                       other_keys = character(), ...) {
   # Define metadata fields
   metadata <- list()
   metadata$geo_type <- geo_type
   metadata$time_type <- time_type
   metadata$as_of <- as_of
-  metadata <- c(metadata, additional_metadata)
+  metadata$other_keys <- other_keys
 
   # Reorder columns (geo_value, time_value, ...)
   if (sum(dim(x)) != 0) {
-    cols_to_put_first <- c("geo_value", "time_value")
+    cols_to_put_first <- c("geo_value", "time_value", other_keys)
     x <- x[, c(
       cols_to_put_first,
       # All other columns
@@ -200,7 +199,8 @@ new_epi_df <- function(x = tibble::tibble(), geo_type, time_type, as_of,
   return(x)
 }
 
-#' @rdname epi_df
+#' @describeIn epi_df The preferred way of constructing `epi_df`s
+#' @order 1
 #' @param x An `epi_df`, `data.frame`, [tibble::tibble], or [tsibble::tsibble]
 #'   to be converted
 #' @param ... used for specifying column names, as in [`dplyr::rename`]. For
@@ -211,24 +211,26 @@ as_epi_df <- function(x, ...) {
 }
 
 #' @rdname epi_df
+#' @order 1
 #' @method as_epi_df epi_df
 #' @export
 as_epi_df.epi_df <- function(x, ...) {
   return(x)
 }
 
 #' @rdname epi_df
-#' @method as_epi_df tbl_df
+#' @order 1
 #' @importFrom rlang .data
 #' @importFrom tidyselect any_of
 #' @importFrom cli cli_inform
+#' @method as_epi_df tbl_df
 #' @export
 as_epi_df.tbl_df <- function(
     x,
     geo_type = deprecated(),
     time_type = deprecated(),
     as_of,
-    additional_metadata = list(),
+    other_keys = character(),
     ...) {
   # possible standard substitutions for time_value
   x <- rename(x, ...)
@@ -274,29 +276,28 @@ as_epi_df.tbl_df <- function(
     } # Use the current day-time
   }
 
-  assert_list(additional_metadata)
-  additional_metadata[["other_keys"]] <- additional_metadata[["other_keys"]] %||% character(0L)
-  new_epi_df(x, geo_type, time_type, as_of, additional_metadata)
+  assert_character(other_keys)
+  new_epi_df(x, geo_type, time_type, as_of, other_keys)
 }
 
-#' @method as_epi_df data.frame
 #' @rdname epi_df
+#' @order 1
+#' @method as_epi_df data.frame
 #' @export
-as_epi_df.data.frame <- function(x, as_of, additional_metadata = list(), ...) {
-  as_epi_df.tbl_df(x = tibble::as_tibble(x), as_of = as_of, additional_metadata = additional_metadata, ...)
+as_epi_df.data.frame <- function(x, as_of, other_keys = character(), ...) {
+  as_epi_df.tbl_df(x = tibble::as_tibble(x), as_of = as_of, other_keys = other_keys, ...)
 }
 
-#' @method as_epi_df tbl_ts
 #' @rdname epi_df
+#' @order 1
+#' @method as_epi_df tbl_ts
 #' @export
-as_epi_df.tbl_ts <- function(x, as_of, additional_metadata = list(), ...) {
+as_epi_df.tbl_ts <- function(x, as_of, other_keys = character(), ...) {
   tsibble_other_keys <- setdiff(tsibble::key_vars(x), "geo_value")
-  if (length(tsibble_other_keys) != 0) {
-    additional_metadata$other_keys <- unique(
-      c(additional_metadata$other_keys, tsibble_other_keys)
-    )
+  if (length(tsibble_other_keys) > 0) {
+    other_keys <- unique(c(other_keys, tsibble_other_keys))
   }
-  as_epi_df.tbl_df(x = tibble::as_tibble(x), as_of = as_of, additional_metadata = additional_metadata, ...)
+  as_epi_df.tbl_df(x = tibble::as_tibble(x), as_of = as_of, other_keys = other_keys, ...)
 }
 
 #' Test for `epi_df` format