Skip to content

Commit eeb4994

Browse files
authored
Merge pull request #538 from cmu-delphi/ds/vignettes
doc: documentation rework
2 parents cb468db + 6287985 commit eeb4994

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

78 files changed

+3781
-3199
lines changed

.Rbuildignore

+5-1
Original file line numberDiff line numberDiff line change
@@ -17,4 +17,8 @@
1717
^DEVELOPMENT.md$
1818
man-roxygen
1919
^.venv$
20-
^sandbox.R$
20+
^sandbox.R$
21+
^README.Rmd$
22+
^README_cache$
23+
^pkgdown-watch.R$
24+
^.editorconfig$

.editorconfig

+21
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
# EditorConfig helps developers define and maintain consistent
2+
# coding styles between different editors and IDEs
3+
# editorconfig.org
4+
5+
root = true
6+
7+
8+
[*]
9+
10+
# Change these settings to your own preference
11+
indent_style = space
12+
indent_size = 2
13+
14+
# We recommend you to keep these unchanged
15+
end_of_line = lf
16+
charset = utf-8
17+
trim_trailing_whitespace = true
18+
insert_final_newline = true
19+
20+
[*.md]
21+
trim_trailing_whitespace = false

.gitignore

+6-1
Original file line numberDiff line numberDiff line change
@@ -13,4 +13,9 @@ docs
1313
renv/
1414
renv.lock
1515
.Rprofile
16-
sandbox.R
16+
sandbox.R
17+
# Vignette caches
18+
*_cache/
19+
vignettes/*.html
20+
vignettes/*.R
21+
!vignettes/_common.R

DESCRIPTION

+9-8
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
1-
Type: Package
21
Package: epiprocess
2+
Type: Package
33
Title: Tools for basic signal processing in epidemiology
4-
Version: 0.9.5
4+
Version: 0.9.6
55
Authors@R: c(
66
person("Jacob", "Bien", role = "ctb"),
77
person("Logan", "Brooks", , "[email protected]", role = c("aut", "cre")),
@@ -28,11 +28,11 @@ Authors@R: c(
2828
person("Carnegie Mellon University Delphi Group", role = "dtc",
2929
comment = "Owner of claims-based CLI data from the Delphi Epidata API")
3030
)
31-
Description: This package introduces a common data structure for
32-
epidemiological data reported by location and time, provides another
33-
data structure to work with revisions to these data sets over time,
34-
and offers associated utilities to perform basic signal processing
35-
tasks.
31+
Description: This package introduces common data structures for working with
32+
epidemiological data reported by location and time and offers associated
33+
utilities to perform basic signal processing tasks. The package is designed
34+
to be used in conjunction with `epipredict` for building and evaluating
35+
epidemiological models.
3636
License: MIT + file LICENSE
3737
URL: https://cmu-delphi.github.io/epiprocess/
3838
Depends:
@@ -62,6 +62,7 @@ Imports:
6262
Suggests:
6363
devtools,
6464
epidatr,
65+
here,
6566
knitr,
6667
outbreaks,
6768
readr,
@@ -88,7 +89,7 @@ Collate:
8889
'correlation.R'
8990
'epi_df.R'
9091
'epi_df_forbidden_methods.R'
91-
'epiprocess.R'
92+
'epiprocess-package.R'
9293
'group_by_epi_df_methods.R'
9394
'methods-epi_archive.R'
9495
'grouped_epi_archive.R'

DEVELOPMENT.md

+16-18
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,8 @@
11
## Setting up the development environment
22

33
```r
4-
install.packages(c('devtools', 'pkgdown', 'styler', 'lintr')) # install dev dependencies
5-
devtools::install_deps(dependencies = TRUE) # install package dependencies
6-
devtools::document() # generate package meta data and man files
7-
devtools::build() # build package
4+
install.packages(c('devtools', 'pkgdown', 'styler', 'lintr', 'pak')) # install dev dependencies
5+
pak::pkg_install(".") # install package and dependencies
86
```
97

108
## Validating the package
@@ -13,8 +11,12 @@ devtools::build() # build package
1311
styler::style_pkg() # format code
1412
lintr::lint_package() # lint code
1513

14+
devtools::check() # run R CMD check, which runs everything below
15+
devtools::document() # generate package meta data and man files
1616
devtools::test() # test package
17-
devtools::check() # check package for errors
17+
devtools::build_vignettes() # build vignettes only
18+
devtools::run_examples() # run doc examples
19+
devtools::check(vignettes = FALSE) # check package without vignettes
1820
```
1921

2022
## Developing the documentation site
@@ -24,20 +26,16 @@ Our CI builds two versions of the documentation:
2426
- https://cmu-delphi.github.io/epiprocess/ from the `main` branch and
2527
- https://cmu-delphi.github.io/epiprocess/dev from the `dev` branch.
2628

27-
The documentation site can be previewed locally by running in R:
28-
29-
```r
30-
# Should automatically open a browser
31-
pkgdown::build_site(preview=TRUE)
32-
```
33-
34-
If the above does not open a browser, you can try using a Python server from the
35-
command line:
29+
We include the script `pkgdown-watch.R` that will automatically rebuild the
30+
documentation locally and preview it. It can be used with:
3631

37-
```bash
38-
R -e 'devtools::document()'
39-
R -e 'pkgdown::build_site()'
40-
python -m http.server -d docs
32+
```sh
33+
# Make sure you have servr installed
34+
R -e 'renv::install("servr")'
35+
# Will start a local server
36+
Rscript pkgdown-watch.R
37+
# You may need to first build the site with
38+
R -e 'pkgdown::build_site(".", examples = FALSE, devel = TRUE, preview = FALSE)'
4139
```
4240

4341
## Versioning

NAMESPACE

+1-4
Original file line numberDiff line numberDiff line change
@@ -42,8 +42,6 @@ S3method(key_colnames,default)
4242
S3method(key_colnames,epi_archive)
4343
S3method(key_colnames,epi_df)
4444
S3method(mean,epi_df)
45-
S3method(next_after,Date)
46-
S3method(next_after,integer)
4745
S3method(print,epi_archive)
4846
S3method(print,epi_df)
4947
S3method(print,grouped_epi_archive)
@@ -65,6 +63,7 @@ export(complete)
6563
export(covid_case_death_rates_extended)
6664
export(covid_incidence_county_subset)
6765
export(covid_incidence_outliers)
66+
export(deprecated_quo_is_present)
6867
export(detect_outlr)
6968
export(detect_outlr_rm)
7069
export(detect_outlr_stl)
@@ -89,11 +88,9 @@ export(guess_period)
8988
export(is_epi_df)
9089
export(is_grouped_epi_archive)
9190
export(key_colnames)
92-
export(max_version_with_row_in)
9391
export(mutate)
9492
export(new_epi_archive)
9593
export(new_epi_df)
96-
export(next_after)
9794
export(relocate)
9895
export(rename)
9996
export(revision_summary)

NEWS.md

+2
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat
2727
using a formula to specify the slide computation, and other bits of forgotten
2828
syntax.
2929
- Improved validation of `.window_size` arguments.
30+
- Rewrote a lot of the package documentation to be more consistent and
31+
informative. Simplified and streamlined the vignettes.
3032

3133
## Cleanup
3234

R/archive.R

+58-73
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,9 @@
99

1010
#' Validate a version bound arg
1111
#'
12-
#' Expected to be used on `clobberable_versions_start`, `versions_end`,
13-
#' and similar arguments. Some additional context-specific checks may be needed.
12+
#' Expected to be used on `clobberable_versions_start`, `versions_end`, and
13+
#' similar arguments. Some additional context-specific checks may be needed.
14+
#' Side effects: raises an error if version bound appears invalid.
1415
#'
1516
#' @param version_bound the version bound to validate
1617
#' @param x a data frame containing a version column with which to check
@@ -20,9 +21,7 @@
2021
#' @param version_bound_arg optional string; what to call the version bound in
2122
#' error messages
2223
#'
23-
#' @section Side effects: raises an error if version bound appears invalid
24-
#'
25-
#' @noRd
24+
#' @keywords internal
2625
validate_version_bound <- function(version_bound, x, na_ok = FALSE,
2726
version_bound_arg = rlang::caller_arg(version_bound),
2827
x_arg = rlang::caller_arg(x)) {
@@ -75,9 +74,7 @@ validate_version_bound <- function(version_bound, x, na_ok = FALSE,
7574
#' @return `max(x$version)` if it has any rows; raises error if it has 0 rows or
7675
#' an `NA` version value
7776
#'
78-
#' @importFrom checkmate check_names
79-
#'
80-
#' @export
77+
#' @keywords internal
8178
max_version_with_row_in <- function(x) {
8279
if (nrow(x) == 0L) {
8380
cli_abort(
@@ -108,72 +105,71 @@ max_version_with_row_in <- function(x) {
108105
#' @param x the starting "value"(s)
109106
#' @return same class, typeof, and length as `x`
110107
#'
111-
#' @export
108+
#' @keywords internal
112109
next_after <- function(x) UseMethod("next_after")
113110

114111

115-
#' @export
112+
#' @keywords internal
116113
next_after.integer <- function(x) x + 1L
117114

118115

119-
#' @export
116+
#' @keywords internal
120117
next_after.Date <- function(x) x + 1L
121118

122119

123-
#' Compactify
124-
#'
125-
#' This section describes the internals of how compactification works in an
126-
#' `epi_archive()`. Compactification can potentially improve code speed or
127-
#' memory usage, depending on your data.
128-
#'
129-
#' In general, the last version of each observation is carried forward (LOCF) to
130-
#' fill in data between recorded versions, and between the last recorded
131-
#' update and the `versions_end`. One consequence is that the `DT` doesn't
132-
#' have to contain a full snapshot of every version (although this generally
133-
#' works), but can instead contain only the rows that are new or changed from
134-
#' the previous version (see `compactify`, which does this automatically).
135-
#' Currently, deletions must be represented as revising a row to a special
136-
#' state (e.g., making the entries `NA` or including a special column that
137-
#' flags the data as removed and performing some kind of post-processing), and
138-
#' the archive is unaware of what this state is. Note that `NA`s *can* be
139-
#' introduced by `epi_archive` methods for other reasons, e.g., in
140-
#' [`epix_fill_through_version`] and [`epix_merge`], if requested, to
141-
#' represent potential update data that we do not yet have access to; or in
142-
#' [`epix_merge`] to represent the "value" of an observation before the
143-
#' version in which it was first released, or if no version of that
144-
#' observation appears in the archive data at all.
120+
#' `epi_archive` object
145121
#'
146-
#' @name compactify
147-
NULL
148-
149-
150-
#' Epi Archive
151-
#'
152-
#' @title `epi_archive` object
122+
#' @description The second main data structure for storing time series in
123+
#' `epiprocess`. It is similar to `epi_df` in that it is fundamentally a table with
124+
#' a few required columns that stores epidemiological time series data. An
125+
#' `epi_archive` requires a `geo_value`, `time_value`, and `version` column (and
126+
#' possibly other key columns) along with measurement values. In brief, an
127+
#' `epi_archive` is a history of the time series data, where the `version`
128+
#' column tracks the time at which the data was available. This allows for
129+
#' version-aware forecasting.
153130
#'
154-
#' @description An `epi_archive` is an S3 class which contains a data table
155-
#' along with several relevant pieces of metadata. The data table can be seen
156-
#' as the full archive (version history) for some signal variables of
157-
#' interest.
131+
#' `new_epi_archive` is the constructor for `epi_archive` objects that assumes
132+
#' all arguments have been validated. Most users should use `as_epi_archive`.
158133
#'
159-
#' @details An `epi_archive` contains a data table `DT`, of class `data.table`
160-
#' from the `data.table` package, with (at least) the following columns:
134+
#' @details An `epi_archive` contains a `data.table` object `DT` (from the
135+
#' `{data.table}` package), with (at least) the following columns:
161136
#'
162-
#' * `geo_value`: the geographic value associated with each row of measurements.
163-
#' * `time_value`: the time value associated with each row of measurements.
137+
#' * `geo_value`: the geographic value associated with each row of measurements,
138+
#' * `time_value`: the time value associated with each row of measurements,
164139
#' * `version`: the time value specifying the version for each row of
165140
#' measurements. For example, if in a given row the `version` is January 15,
166141
#' 2022 and `time_value` is January 14, 2022, then this row contains the
167142
#' measurements of the data for January 14, 2022 that were available one day
168143
#' later.
169144
#'
170-
#' The data table `DT` has key variables `geo_value`, `time_value`, `version`,
171-
#' as well as any others (these can be specified when instantiating the
172-
#' `epi_archive` object via the `other_keys` argument, and/or set by operating
173-
#' on `DT` directly). Note that there can only be a single row per unique
174-
#' combination of key variables.
145+
#' The variables `geo_value`, `time_value`, and `version` serve as key variables for
146+
#' the data table (in addition to any other keys specified in the metadata).
147+
#' There can only be a single row per unique combination of key variables. The
148+
#' keys for an `epi_archive` can be viewed with `key(epi_archive$DT)`.
149+
#'
150+
#' ## Compactification
151+
#'
152+
#' By default, an `epi_archive` will compactify the data table to remove
153+
#' redundant rows. This is done by not storing rows that have the same value,
154+
#' except for the `version` column (this is essentially a last observation
155+
#' carried forward, but along the version index). This is done to save space and
156+
#' improve performance. If you do not want to compactify the data, you can set
157+
#' `compactify = FALSE` in `as_epi_archive()`.
158+
#'
159+
#' Note that in some data scenarios, LOCF may not be appropriate. For instance,
160+
#' if you expected data to be updated on a given day, but your data source did
161+
#' not update, then it could be reasonable to code the data as `NA` for that
162+
#' day, instead of assuming LOCF.
163+
#'
164+
#' `NA`s *can* be introduced by `epi_archive` methods for other
165+
#' reasons, e.g., in [`epix_fill_through_version`] and [`epix_merge`], if
166+
#' requested, to represent potential update data that we do not yet have access
167+
#' to; or in [`epix_merge`] to represent the "value" of an observation before
168+
#' the version in which it was first released, or if no version of that
169+
#' observation appears in the archive data at all.
170+
#'
171+
#' ## Metadata
175172
#'
176-
#' @section Metadata:
177173
#' The following pieces of metadata are included as fields in an `epi_archive`
178174
#' object:
179175
#'
@@ -187,20 +183,6 @@ NULL
187183
#' archive. Unexpected behavior may result from modifying the metadata
188184
#' directly.
189185
#'
190-
#' @section Generating Snapshots:
191-
#' An `epi_archive` object can be used to generate a snapshot of the data in
192-
#' `epi_df` format, which represents the most up-to-date time series values up
193-
#' to a point in time. This is accomplished by calling `epix_as_of()`.
194-
#'
195-
#' @section Sliding Computations:
196-
#' We can run a sliding computation over an `epi_archive` object, much like
197-
#' `epi_slide()` does for an `epi_df` object. This is accomplished by calling
198-
#' the `slide()` method for an `epi_archive` object, which works similarly to
199-
#' the way `epi_slide()` works for an `epi_df` object, but with one key
200-
#' difference: it is version-aware. That is, for an `epi_archive` object, the
201-
#' sliding computation at any given reference time point t is performed on
202-
#' **data that would have been available as of t**.
203-
#'
204186
#' @param x A data.frame, data.table, or tibble, with columns `geo_value`,
205187
#' `time_value`, `version`, and then any additional number of columns.
206188
#' @param geo_type DEPRECATED Has no effect. Geo value type is inferred from the
@@ -239,10 +221,11 @@ NULL
239221
#' value of `clobberable_versions_start` does not fully trust these empty
240222
#' updates, and assumes that any version `>= max(x$version)` could be
241223
#' clobbered.) If `nrow(x) == 0`, then this argument is mandatory.
242-
#' @param compactify_tol double. the tolerance used to detect approximate equality for compactification
224+
#' @param compactify_tol double. The tolerance used to detect approximate
225+
#' equality for compactification
243226
#' @return An `epi_archive` object.
244227
#'
245-
#' @importFrom data.table as.data.table key setkeyv
228+
#' @seealso [`epix_as_of`] [`epix_merge`] [`epix_slide`]
246229
#' @importFrom dplyr if_any if_all everything
247230
#' @importFrom utils capture.output
248231
#'
@@ -356,12 +339,13 @@ new_epi_archive <- function(
356339
)
357340
}
358341

359-
#' given a tibble as would be found in an epi_archive, remove duplicate entries.
360-
#' @description
361-
#' works by shifting all rows except the version, then comparing values to see
342+
#' Given a tibble as would be found in an epi_archive, remove duplicate entries.
343+
#'
344+
#' Works by shifting all rows except the version, then comparing values to see
362345
#' if they've changed. We need to arrange in descending order, but note that
363346
#' we don't need to group, since at least one column other than version has
364347
#' changed, and so is kept.
348+
#'
365349
#' @keywords internal
366350
#' @importFrom dplyr filter
367351
apply_compactify <- function(df, keys, tolerance = .Machine$double.eps^.5) {
@@ -466,6 +450,7 @@ validate_epi_archive <- function(
466450

467451
#' `as_epi_archive` converts a data frame, data table, or tibble into an
468452
#' `epi_archive` object.
453+
#'
469454
#' @param ... used for specifying column names, as in [`dplyr::rename`]. For
470455
#' example `version = release_date`
471456
#' @param .versions_end location based versions_end, used to avoid prefix

0 commit comments

Comments
 (0)