Skip to content

Commit c7749c8

Browse files
authored
Merge pull request #103 from cmu-delphi/cyou/improve-epi-df-doc
improve epi df doc
2 parents 0177a5a + 0693a21 commit c7749c8

File tree

3 files changed

+212
-0
lines changed

3 files changed

+212
-0
lines changed

R/epi_df.R

+58
Original file line numberDiff line numberDiff line change
@@ -190,6 +190,64 @@ new_epi_df = function(x = tibble::tibble(), geo_type, time_type, as_of,
190190
#' @return An `epi_df` object.
191191
#'
192192
#' @export
193+
#' @examples
194+
#' # Convert a `tsibble` that has county code as an extra key
195+
#' # Notice that county code should be a character string to preserve any leading zeroes
196+
#'
197+
#' ex1_input <- tibble::tibble(
198+
#' geo_value = rep(c("ca", "fl", "pa"), each = 3),
199+
#' county_code = c("06059","06061","06067",
200+
#' "12111","12113","12117",
201+
#' "42101", "42103","42105"),
202+
#' time_value = rep(seq(as.Date("2020-06-01"), as.Date("2020-06-03"),
203+
#' by = "day"), length.out = length(geo_value)),
204+
#' value = 1:length(geo_value) + 0.01 * rnorm(length(geo_value))
205+
#' ) %>%
206+
#' tsibble::as_tsibble(index = time_value, key = c(geo_value, county_code))
207+
#'
208+
#' # The `other_keys` metadata (`"county_code"` in this case) is automatically
209+
#' # inferred from the `tsibble`'s `key`:
210+
#' ex1 <- as_epi_df(x = ex1_input, geo_type = "state", time_type = "day", as_of = "2020-06-03")
211+
#' attr(ex1,"metadata")[["other_keys"]]
212+
#'
213+
#'
214+
#'
215+
#' # Dealing with misspecified column names:
216+
#' # Geographical and temporal information must be provided in columns named
217+
#' # `geo_value` and `time_value`; if we start from a data frame with a
218+
#' # different format, it must be converted to use `geo_value` and `time_value`
219+
#' # before calling `as_epi_df`.
220+
#'
221+
#' ex2_input <- tibble::tibble(
222+
#' state = rep(c("ca", "fl", "pa"), each = 3), # misnamed
223+
#' pol = rep(c("blue", "swing", "swing"), each = 3), # extra key
224+
#' reported_date = rep(seq(as.Date("2020-06-01"), as.Date("2020-06-03"),
225+
#' by = "day"), length.out = length(state)), # misnamed
226+
#' value = 1:length(state) + 0.01 * rnorm(length(state))
227+
#' )
228+
#'
229+
#' print(ex2_input)
230+
#'
231+
#' ex2 <- ex2_input %>% dplyr::rename(geo_value = state, time_value = reported_date) %>%
232+
#' as_epi_df(geo_type = "state", as_of = "2020-06-03",
233+
#' additional_metadata = c(other_keys = "pol"))
234+
#'
235+
#' attr(ex2,"metadata")
236+
#'
237+
#'
238+
#'
239+
#' # Adding additional keys to an `epi_df` object
240+
#'
241+
#' ex3_input <- jhu_csse_county_level_subset %>%
242+
#' dplyr::filter(time_value > "2021-12-01", state_name == "Massachusetts") %>%
243+
#' dplyr::slice_tail(n = 6)
244+
#'
245+
#' ex3 <- ex3_input %>%
246+
#' tsibble::as_tsibble() %>% # needed to add the additional metadata
247+
#' dplyr::mutate(state = rep("MA",6)) %>%
248+
#' as_epi_df(additional_metadata = c(other_keys = "state"))
249+
#'
250+
#' attr(ex3,"metadata")
193251
as_epi_df = function(x, ...) {
194252
# S3 generic: dispatches to the `as_epi_df` method registered for the class
# of `x`; `x` and `...` are handed on to that method unchanged.
UseMethod("as_epi_df")
195253
}

man/as_epi_df.Rd

+59
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

vignettes/epiprocess.Rmd

+95
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast.html).
5858
library(delphi.epidata)
5959
library(epiprocess)
6060
library(dplyr)
61+
library(withr)
6162
6263
cases <- covidcast(
6364
data_source = "jhu-csse",
@@ -127,6 +128,91 @@ x <- as_epi_df(cases) %>%
127128
attributes(x)$metadata
128129
```
129130

131+
## Using additional key columns in `epi_df`
132+
In the following examples we will show how to create an `epi_df` with additional keys.
133+
134+
### Converting a `tsibble` that has county code as an extra key
135+
```{r}
136+
ex1 <- tibble(
137+
geo_value = rep(c("ca", "fl", "pa"), each = 3),
138+
county_code = c("06059","06061","06067",
139+
"12111","12113","12117",
140+
"42101","42103","42105"),
141+
time_value = rep(seq(as.Date("2020-06-01"), as.Date("2020-06-03"),
142+
by = "day"), length.out = length(geo_value)),
143+
value = 1:length(geo_value) + 0.01 * withr::with_rng_version("3.0.0", withr::with_seed(42, rnorm(length(geo_value)))) # seeded noise, reproducible across builds
144+
) %>%
145+
as_tsibble(index = time_value, key = c(geo_value, county_code))
146+
147+
ex1 <- as_epi_df(x = ex1, geo_type = "state", time_type = "day", as_of = "2020-06-03")
148+
```
149+
150+
The metadata now includes `county_code` as an extra key.
151+
```{r}
152+
attr(ex1,"metadata")
153+
```
154+
155+
156+
### Dealing with misspecified column names
157+
158+
`epi_df` requires columns named `geo_value` and `time_value`; if they do not exist, `as_epi_df()` throws an error.
159+
```{r, error = TRUE}
160+
# The misnamed `state` / `reported_date` columns are kept on purpose so that
# `as_epi_df()` is the call that fails. The data is built with `tibble()`
# because it lets `length(state)` refer to the column defined just above,
# which `data.frame()` cannot do; it is then converted to a plain data frame.
tibble(
  state = rep(c("ca", "fl", "pa"), each = 3), # misnamed
  pol = rep(c("blue", "swing", "swing"), each = 3), # extra key
  reported_date = rep(seq(as.Date("2020-06-01"), as.Date("2020-06-03"),
                          by = "day"), length.out = length(state)), # misnamed
  value = 1:length(state) + 0.01 * withr::with_rng_version("3.0.0", withr::with_seed(42, rnorm(length(state))))
) %>%
  data.frame() %>%
  as_epi_df()
167+
```
168+
169+
The columns can be renamed to match `epi_df` format. In the example below, notice there is also an additional key `pol`.
170+
```{r}
171+
ex2 <- tibble(
172+
state = rep(c("ca", "fl", "pa"), each = 3), # misnamed
173+
pol = rep(c("blue", "swing", "swing"), each = 3), # extra key
174+
reported_date = rep(seq(as.Date("2020-06-01"), as.Date("2020-06-03"),
175+
by = "day"), length.out = length(state)), # misnamed
176+
value = 1:length(state) + 0.01 * withr::with_rng_version("3.0.0", withr::with_seed(42, rnorm(length(state)))) # seeded noise, reproducible across builds
177+
) %>% data.frame()
178+
179+
head(ex2)
180+
181+
ex2 <- ex2 %>% rename(geo_value = state, time_value = reported_date) %>%
182+
as_epi_df(geo_type = "state", as_of = "2020-06-03",
183+
additional_metadata = c(other_keys = "pol"))
184+
185+
attr(ex2,"metadata")
186+
```
187+
188+
189+
### Adding additional keys to an `epi_df` object
190+
191+
In the above examples, all the keys are added to objects that are not `epi_df` objects. We illustrate how to add keys to an `epi_df` object.
192+
193+
We use a toy data set included in `epiprocess`, prepared using the `covidcast` library, and filter it to a single state for simplicity.
194+
195+
```{r}
196+
ex3 <- jhu_csse_county_level_subset %>%
197+
filter(time_value > "2021-12-01", state_name == "Massachusetts") %>%
198+
slice_tail(n = 6)
199+
200+
attr(ex3,"metadata") # geo_type is county currently
201+
```
202+
203+
Now we add state (MA) as a new column and a key to the metadata. As a reminder, lowercase state abbreviations are what we would expect if this were a `geo_value` column.
204+
```{r}
205+
206+
ex3 <- ex3 %>%
207+
as_tibble() %>% # needed to add the additional metadata
208+
mutate(state = rep(tolower("MA"),6)) %>%
209+
as_epi_df(additional_metadata = c(other_keys = "state"))
210+
211+
attr(ex3,"metadata")
212+
```
213+
214+
Currently, the `other_keys` metadata in an `epi_df` doesn't affect `epi_slide()`, in contrast to `other_keys` in `as_epi_archive()`, which affects how the update data is interpreted.
215+
130216
## Working with `epi_df` objects downstream
131217

132218
Data in `epi_df` format should be easy to work with downstream, since it is a
@@ -198,3 +284,12 @@ ggplot(x, aes(x = time_value, y = cases)) +
198284
scale_x_date(minor_breaks = "month", date_labels = "%b %y") +
199285
labs(x = "Date", y = "Confirmed cases of Ebola in Sierra Leone")
200286
```
287+
288+
289+
290+
## Attribution
291+
This document contains a dataset that is a modified part of the [COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University](https://github.com/CSSEGISandData/COVID-19) as [republished in the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html). This data set is licensed under the terms of the [Creative Commons Attribution 4.0 International license](https://creativecommons.org/licenses/by/4.0/) by the Johns Hopkins University on behalf of its Center for Systems Science in Engineering. Copyright Johns Hopkins University 2020.
292+
293+
[From the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html):
294+
These signals are taken directly from the JHU CSSE [COVID-19 GitHub repository](https://github.com/CSSEGISandData/COVID-19) without changes.
295+

0 commit comments

Comments
 (0)