Skip to content

Commit 8948868

Browse files
committed
Merge remote-tracking branch 'upstream/dev' into lcb/fix-guess-period-datetimes
2 parents a5f397f + 69ea5e4 commit 8948868

22 files changed

+404
-78
lines changed

DESCRIPTION

+1-4
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
Type: Package
22
Package: epiprocess
33
Title: Tools for basic signal processing in epidemiology
4-
Version: 0.7.13
4+
Version: 0.7.14
55
Authors@R: c(
66
person("Jacob", "Bien", role = "ctb"),
77
person("Logan", "Brooks", email = "[email protected]", role = c("aut", "cre")),
@@ -30,9 +30,6 @@ Imports:
3030
cli,
3131
data.table,
3232
dplyr (>= 1.0.0),
33-
fabletools,
34-
feasts,
35-
generics,
3633
genlasso,
3734
ggplot2,
3835
lifecycle (>= 1.0.1),

NAMESPACE

+5
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ export(epix_merge)
6464
export(epix_slide)
6565
export(epix_truncate_versions_after)
6666
export(filter)
67+
export(geo_column_names)
6768
export(group_by)
6869
export(group_modify)
6970
export(growth_rate)
@@ -79,9 +80,11 @@ export(next_after)
7980
export(relocate)
8081
export(rename)
8182
export(slice)
83+
export(time_column_names)
8284
export(ungroup)
8385
export(unnest)
8486
export(validate_epi_archive)
87+
export(version_column_names)
8588
importFrom(checkmate,anyInfinite)
8689
importFrom(checkmate,anyMissing)
8790
importFrom(checkmate,assert)
@@ -104,6 +107,7 @@ importFrom(checkmate,test_subset)
104107
importFrom(checkmate,vname)
105108
importFrom(cli,cat_line)
106109
importFrom(cli,cli_abort)
110+
importFrom(cli,cli_inform)
107111
importFrom(cli,cli_vec)
108112
importFrom(cli,cli_warn)
109113
importFrom(cli,format_message)
@@ -190,6 +194,7 @@ importFrom(tibble,as_tibble)
190194
importFrom(tibble,new_tibble)
191195
importFrom(tibble,validate_tibble)
192196
importFrom(tidyr,unnest)
197+
importFrom(tidyselect,any_of)
193198
importFrom(tidyselect,eval_select)
194199
importFrom(tidyselect,starts_with)
195200
importFrom(tsibble,as_tsibble)

NEWS.md

+13
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,11 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat
44

55
# epiprocess 0.8
66

7+
## Breaking changes
8+
- `detect_outlr_stl(seasonal_period = NULL)` is no longer accepted. Use
9+
`detect_outlr_stl(seasonal_period = <value>, seasonal_as_residual = TRUE)`
10+
instead. See `?detect_outlr_stl` for more details.
11+
712
## Improvements
813

914
- `epi_slide` computations are now 2-4 times faster after changing how
@@ -35,6 +40,11 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat
3540
- Improved documentation web site landing page's introduction.
3641
- Fixed documentation referring to old `epi_slide()` interface (#466, thanks
3742
@XuedaShen!).
43+
- `as_epi_df` and `as_epi_archive` now support arguments to specify column names
44+
e.g. `as_epi_df(some_tibble, geo_value=state)`. In addition, there is a list
45+
of default conversions; see `time_column_names()` for the columns that
46+
will automatically be recognized and converted to the `time_value` column (there
47+
are similar functions for `geo` and `version`).
3848
- Fixed bug where `epix_slide_ref_time_values_default()` on datetimes would
3949
output a huge number of `ref_time_values` spaced apart by mere seconds.
4050

@@ -45,6 +55,9 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat
4555
- Added optional `decay_to_tibble` attribute controlling `as_tibble()` behavior
4656
of `epi_df`s to let `{epipredict}` work more easily with other libraries (#471).
4757

58+
## Cleanup
59+
- Removed some external package dependencies.
60+
4861
# epiprocess 0.7.0
4962

5063
## Breaking changes:

R/archive.R

+15-2
Original file line numberDiff line numberDiff line change
@@ -442,6 +442,11 @@ validate_epi_archive <- function(
442442

443443
#' `as_epi_archive` converts a data frame, data table, or tibble into an
444444
#' `epi_archive` object.
445+
#' @param ... used for specifying column names, as in [`dplyr::rename`]. For
446+
#' example `version = release_date`
447+
#' @param .versions_end positional stand-in for `versions_end`, used to keep
448+
#' `version = issue` from being partially matched to `versions_end` instead of
449+
#' being used to rename columns.
445450
#'
446451
#' @rdname epi_archive
447452
#'
@@ -454,11 +459,19 @@ as_epi_archive <- function(
454459
additional_metadata = NULL,
455460
compactify = NULL,
456461
clobberable_versions_start = NULL,
457-
versions_end = NULL) {
462+
.versions_end = NULL, ...,
463+
versions_end = .versions_end) {
458464
assert_data_frame(x)
465+
x <- rename(x, ...)
466+
x <- guess_column_name(x, "time_value", time_column_names())
467+
x <- guess_column_name(x, "geo_value", geo_column_names())
468+
x <- guess_column_name(x, "version", version_column_names())
459469
if (!test_subset(c("geo_value", "time_value", "version"), names(x))) {
460470
cli_abort(
461-
"Columns `geo_value`, `time_value`, and `version` must be present in `x`."
471+
"Either columns `geo_value`, `time_value`, and `version`, or related columns
472+
(see the internal functions `guess_time_column_name()`,
473+
`guess_geo_column_name()` and/or `guess_geo_version_name()` for complete
474+
list) must be present in `x`."
462475
)
463476
}
464477
if (anyMissing(x$version)) {

R/epi_df.R

+25-10
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ NULL
9595
#'
9696
#' @export
9797
new_epi_df <- function(x = tibble::tibble(), geo_type, time_type, as_of,
98-
additional_metadata = list(), ...) {
98+
additional_metadata = list()) {
9999
assert_data_frame(x)
100100
assert_list(additional_metadata)
101101

@@ -162,6 +162,7 @@ new_epi_df <- function(x = tibble::tibble(), geo_type, time_type, as_of,
162162
#' guide](https://cmu-delphi.github.io/epiprocess/articles/epiprocess.html) for
163163
#' examples.
164164
#'
165+
#' @param ... Additional arguments passed to methods.
165166
#' @template epi_df-params
166167
#'
167168
#' @export
@@ -249,25 +250,39 @@ as_epi_df.epi_df <- function(x, ...) {
249250

250251
#' @method as_epi_df tbl_df
251252
#' @describeIn as_epi_df The input tibble `x` must contain the columns
252-
#' `geo_value` and `time_value`. All other columns will be preserved as is,
253-
#' and treated as measured variables. If `as_of` is missing, then the function
254-
#' will try to guess it from an `as_of`, `issue`, or `version` column of `x`
255-
#' (if any of these are present), or from as an `as_of` field in its metadata
256-
#' (stored in its attributes); if this fails, then the current day-time will
257-
#' be used.
253+
#' `geo_value` and `time_value`, or column names that uniquely map onto these
254+
#' (e.g. `date` or `province`). Alternatively, you can specify the conversion
255+
#' explicitly (`time_value = someWeirdColumnName`). All other columns not
256+
#' specified as `other_keys` will be preserved as is, and treated as measured
257+
#' variables.
258+
#'
259+
#' If `as_of` is missing, then the function will try to guess it from an
260+
#' `as_of`, `issue`, or `version` column of `x` (if any of these are present),
261+
#' or from an `as_of` field in its metadata (stored in its attributes); if
262+
#' this fails, then the current day-time will be used.
258263
#' @importFrom rlang .data
264+
#' @importFrom tidyselect any_of
265+
#' @importFrom cli cli_inform
259266
#' @export
260267
as_epi_df.tbl_df <- function(x, geo_type, time_type, as_of,
261-
additional_metadata = list(), ...) {
268+
additional_metadata = list(),
269+
...) {
270+
# possible standard substitutions for time_value
271+
x <- rename(x, ...)
272+
x <- guess_column_name(x, "time_value", time_column_names())
273+
x <- guess_column_name(x, "geo_value", geo_column_names())
262274
if (!test_subset(c("geo_value", "time_value"), names(x))) {
263275
cli_abort(
264-
"Columns `geo_value` and `time_value` must be present in `x`."
276+
"Either columns `geo_value` and `time_value` or related columns
277+
(see the internal functions `guess_time_column_name()` and/or
278+
`guess_geo_column_name()` for a complete list)
279+
must be present in `x`."
265280
)
266281
}
267282

268283
new_epi_df(
269284
x, geo_type, time_type, as_of,
270-
additional_metadata, ...
285+
additional_metadata
271286
)
272287
}
273288

R/outliers.R

+59-34
Original file line numberDiff line numberDiff line change
@@ -64,9 +64,10 @@
6464
#' args = list(list(
6565
#' detect_negatives = TRUE,
6666
#' detection_multiplier = 2.5,
67-
#' seasonal_period = NULL
67+
#' seasonal_period = 7,
68+
#' seasonal_as_residual = TRUE
6869
#' )),
69-
#' abbr = "stl_nonseasonal"
70+
#' abbr = "stl_reseasonal"
7071
#' )
7172
#' )
7273
#'
@@ -216,18 +217,28 @@ detect_outlr_rm <- function(x = seq_along(y), y, n = 21,
216217
#' @param n_trend Number of time steps to use in the rolling window for trend.
217218
#' Default is 21.
218219
#' @param n_seasonal Number of time steps to use in the rolling window for
219-
#' seasonality. Default is 21.
220+
#' seasonality. Default is 21. Can also be the string "periodic". See
221+
#' `s.window` in [`stats::stl`].
220222
#' @param n_threshold Number of time steps to use in rolling window for the IQR
221223
#' outlier thresholds.
222-
#' @param seasonal_period Integer specifying period of seasonality. For example,
223-
#' for daily data, a period 7 means weekly seasonality. The default is `NULL`,
224-
#' meaning that no seasonal term will be included in the STL decomposition.
224+
#' @param seasonal_period Integer specifying period of "seasonality". For
225+
#' example, for daily data, a period 7 means weekly seasonality. It must be
226+
#' strictly larger than 1. Also impacts the size of the low-pass filter
227+
#' window; see `l.window` in [`stats::stl`].
228+
#' @param seasonal_as_residual Boolean specifying whether the seasonal(/weekly)
229+
#' component should be treated as part of the residual component instead of as
230+
#' part of the predictions. The default, `FALSE`, treats it as part of the
231+
#' predictions, so large seasonal(/weekly) components will not lead to
232+
#' flagging points as outliers. `TRUE` may instead consider the extrema of
233+
#' large seasonal variations to be outliers; `n_seasonal` and
234+
#' `seasonal_period` will still have an impact on the result, though, by
235+
#' impacting the estimation of the trend component.
225236
#' @template outlier-detection-options
226237
#' @template detect-outlr-return
227238
#'
228-
#' @details The STL decomposition is computed using the `feasts` package. Once
239+
#' @details The STL decomposition is computed using [`stats::stl()`]. Once
229240
#' computed, the outlier detection method is analogous to the rolling median
230-
#' method in `detect_outlr_rm()`, except with the fitted values and residuals
241+
#' method in [`detect_outlr_rm()`], except with the fitted values and residuals
231242
#' from the STL decomposition taking the place of the rolling median and
232243
#' residuals to the rolling median, respectively.
233244
#'
@@ -252,12 +263,34 @@ detect_outlr_stl <- function(x = seq_along(y), y,
252263
n_trend = 21,
253264
n_seasonal = 21,
254265
n_threshold = 21,
255-
seasonal_period = NULL,
266+
seasonal_period,
267+
seasonal_as_residual = FALSE,
256268
log_transform = FALSE,
257269
detect_negatives = FALSE,
258270
detection_multiplier = 2,
259271
min_radius = 0,
260272
replacement_multiplier = 0) {
273+
if (dplyr::n_distinct(x) != length(y)) {
274+
cli_abort("`x` contains duplicate values. (If being run on a column in an
275+
`epi_df`, did you group by relevant key variables?)")
276+
}
277+
if (length(y) <= 1L) {
278+
cli_abort("`y` has length {length(y)}; that's definitely too little for
279+
STL. (If being run in a `mutate()` or `epi_slide()`, check
280+
whether you grouped by too many variables; you should not be
281+
grouping by `time_value` in particular.)")
282+
}
283+
distinct_x_skips <- unique(diff(x))
284+
if (diff(range(distinct_x_skips)) > 1e-4 * mean(distinct_x_skips)) {
285+
cli_abort("`x` does not appear to have regular spacing; consider filling in
286+
gaps with imputed values (STL does not allow NAs).")
287+
}
288+
if (is.unsorted(x)) { # <- for performance in common (sorted) case
289+
o <- order(x)
290+
x <- x[o]
291+
y <- y[o]
292+
}
293+
261294
# Transform if requested
262295
if (log_transform) {
263296
# Replace all negative values with 0
@@ -266,32 +299,22 @@ detect_outlr_stl <- function(x = seq_along(y), y,
266299
y <- log(y + offset)
267300
}
268301

269-
# Make a tsibble for fabletools, setup and run STL
270-
z_tsibble <- tsibble::tsibble(x = x, y = y, index = x)
271-
272-
stl_formula <- y ~ trend(window = n_trend) +
273-
season(period = seasonal_period, window = n_seasonal)
302+
assert_int(seasonal_period, lower = 2L)
303+
assert_logical(seasonal_as_residual, len = 1L, any.missing = FALSE)
274304

275-
stl_components <- z_tsibble %>%
276-
fabletools::model(feasts::STL(stl_formula, robust = TRUE)) %>%
277-
generics::components() %>%
305+
yts <- stats::ts(y, frequency = seasonal_period)
306+
stl_comp <- stats::stl(yts,
307+
t.window = n_trend, s.window = n_seasonal,
308+
robust = TRUE
309+
)$time.series %>%
278310
tibble::as_tibble() %>%
279-
dplyr::select(.data$trend:.data$remainder) %>% #
280-
dplyr::rename_with(~"seasonal", tidyselect::starts_with("season")) %>%
281311
dplyr::rename(resid = .data$remainder)
282312

283313
# Allocate the seasonal term from STL to either fitted or resid
284-
if (!is.null(seasonal_period)) {
285-
stl_components <- stl_components %>%
286-
dplyr::mutate(
287-
fitted = .data$trend + .data$seasonal
288-
)
314+
if (!seasonal_as_residual) {
315+
stl_comp <- dplyr::mutate(stl_comp, fitted = .data$trend + .data$seasonal)
289316
} else {
290-
stl_components <- stl_components %>%
291-
dplyr::mutate(
292-
fitted = .data$trend,
293-
resid = .data$seasonal + resid
294-
)
317+
stl_comp <- dplyr::mutate(stl_comp, fitted = .data$trend, resid = .data$seasonal + .data$resid)
295318
}
296319

297320
# Detect negatives if requested
@@ -306,10 +329,7 @@ detect_outlr_stl <- function(x = seq_along(y), y,
306329

307330
# Calculate lower and upper thresholds and replacement value
308331
z <- z %>%
309-
dplyr::mutate(
310-
fitted = stl_components$fitted,
311-
resid = stl_components$resid
312-
) %>%
332+
dplyr::mutate(fitted = stl_comp$fitted, resid = stl_comp$resid) %>%
313333
roll_iqr(
314334
n = n_threshold,
315335
detection_multiplier = detection_multiplier,
@@ -337,7 +357,12 @@ roll_iqr <- function(z, n, detection_multiplier, min_radius,
337357
as_type <- as.numeric
338358
}
339359

340-
epi_slide(z, roll_iqr = stats::IQR(resid), before = floor((n - 1) / 2), after = ceiling((n - 1) / 2)) %>%
360+
z %>%
361+
epi_slide(
362+
roll_iqr = stats::IQR(resid),
363+
before = floor((n - 1) / 2),
364+
after = ceiling((n - 1) / 2)
365+
) %>%
341366
dplyr::mutate(
342367
lower = pmax(
343368
min_lower,

0 commit comments

Comments
 (0)