Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update epidist_db() to give more fine-grained control #160

Merged
merged 17 commits into from
Aug 24, 2023
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ S3method(get_citation,epidist)
S3method(get_citation,epiparam)
S3method(get_parameters,epidist)
S3method(head,epiparam)
S3method(is_parameterised,epidist)
S3method(is_parameterised,epiparam)
S3method(plot,epidist)
S3method(plot,vb_epidist)
S3method(print,epidist)
Expand Down
84 changes: 84 additions & 0 deletions R/checkers.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
#' Checks whether <epidist> or <epiparam> object contains a distribution and
#' parameters for that distribution.
#'
#' @description If the <epidist> object or a row in the <epiparam> data frame
#' is missing either a probability distribution or parameters for the
#' probability distribution, `is_parameterised` returns `FALSE`, otherwise
#' it returns `TRUE`.
#'
#' @param x An `epidist` or `epiparam` object.
#' @param ... [`dots`] not used, extra arguments supplied will cause a warning.
#'
#' @return A single `logical` for an `<epidist>`, or a `logical` vector with
#' one element for each entry in the `<epiparam>`.
#' @export
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since this function is exported, would it be possible to add in the description or in the examples a typical use case for users? At the moment, I can only think about internal use cases.

#'
#' @examples
#' # parameterised <epidist>
#' edist <- epidist(
#' disease = "ebola",
#' epi_dist = "incubation",
#' prob_distribution = "gamma",
#' prob_distribution_params = c(shape = 1, scale = 1)
#' )
#' is_parameterised(edist)
#'
#' # unparameterised <epidist>
#' edist <- epidist(
#' disease = "ebola",
#' epi_dist = "incubation"
#' )
#' is_parameterised(edist)
is_parameterised <- function(x, ...) UseMethod("is_parameterised") # S3 generic

#' @export
is_parameterised.epiparam <- function(x, ...) {
  # extra arguments are not used; chkDots() warns if any are supplied
  chkDots(...)

  # Decide, for a single entry, whether a distribution is named and every
  # parameter checked for that distribution is non-missing. Distribution names
  # not listed below fall through to FALSE.
  entry_has_params <- function(entry) {
    distribution <- entry$prob_distribution
    if (is.na(distribution)) {
      return(FALSE)
    }
    switch(
      distribution,
      gamma = ,
      weibull = !(is.na(entry$shape) || is.na(entry$scale)),
      lnorm = !(is.na(entry$meanlog) || is.na(entry$sdlog)),
      nbinom = !(is.na(entry$mean) || is.na(entry$dispersion)),
      geom = ,
      pois = !is.na(entry$mean),
      FALSE
    )
  }

  # transposing puts each entry (row) of `x` into its own column, so vapply()
  # visits the entries one at a time
  entries <- as.data.frame(t(x))

  vapply(
    entries,
    FUN = entry_has_params,
    FUN.VALUE = logical(1),
    USE.NAMES = FALSE
  )
}

#' @export
is_parameterised.epidist <- function(x, ...) {
  # extra arguments are not used; chkDots() warns if any are supplied
  chkDots(...)

  # parameterised means the `prob_dist` element holds a classed object;
  # is.object() is FALSE for unclassed values (and for a missing element)
  is.object(x$prob_dist)
}
58 changes: 11 additions & 47 deletions R/epidist.R
Original file line number Diff line number Diff line change
Expand Up @@ -333,11 +333,19 @@ validate_epidist <- function(epidist) {
c("disease", "epi_dist", "prob_dist", "uncertainty", "summary_stats",
"citation", "metadata", "method_assess", "notes") %in%
attributes(epidist)$names,
"Epidist must contains a disease (single character string)" =
"epidist must contain a disease (single character string)" =
is.character(epidist$disease$disease) &&
length(epidist$disease$disease) == 1,
"Epidist must contain an epidemiological distribution" =
is.character(epidist$epi_dist) && length(epidist$epi_dist) == 1
"epidist must contain an epidemiological distribution" =
is.character(epidist$epi_dist) && length(epidist$epi_dist) == 1,
"epidisit must contain uncertainty, summary stats and metadata" =
all(
is.list(epidist$uncertainty),
is.list(epidist$summary_stats),
is.list(epidist$metadata)
),
"epidist must contain a citation" =
is.character(epidist$citation)
)

invisible(epidist)
Expand Down Expand Up @@ -918,47 +926,3 @@ is_truncated <- function(x) {
return(FALSE)
}
}

#' Check whether an <epidist> object holds a probability distribution
#'
#' @description Returns `TRUE` when the `prob_dist` element of the <epidist>
#' is a distribution object, and `FALSE` when the <epidist> was created
#' without a probability distribution or without parameters for it.
#'
#' @param x An `epidist` object.
#'
#' @return A single `logical` value.
#' @export
#'
#' @examples
#' # parameterised <epidist>
#' edist <- epidist(
#' disease = "ebola",
#' epi_dist = "incubation",
#' prob_distribution = "gamma",
#' prob_distribution_params = c(shape = 1, scale = 1)
#' )
#' is_parameterised(edist)
#'
#' # unparameterised <epidist>
#' edist <- epidist(
#' disease = "ebola",
#' epi_dist = "incubation"
#' )
#' is_parameterised(edist)
is_parameterised <- function(x) {
  # reject anything that is not an <epidist> before looking inside it
  input_is_epidist <- is_epidist(x)
  stopifnot(
    "is_parameterised only works for <epidist> objects" = input_is_epidist
  )

  # a classed object stored in `prob_dist` is what marks the <epidist> as
  # parameterised
  is.object(x$prob_dist)
}
179 changes: 145 additions & 34 deletions R/epidist_db.R
Original file line number Diff line number Diff line change
@@ -1,22 +1,55 @@
#' Create an `epidist` object directly from the epiparameter library (database)
#' Create an `epidist` object(s) directly from the epiparameter library
#' (database)
#'
#' @description This function can extract an `epidist` object directly from the
#' library of epidemiological parameters without having to read in an `epiparam`
#' object and pull out an `epidist` object from one of the entries. If a
#' distribution from a specific study is required, the `author` argument can
#' be specified. For now this must match the author entry in the database
#' exactly to be recognised.
#' @description This function can extract an `epidist` object(s) directly from
#' the library of epidemiological parameters without having to read in an
#' `epiparam` object and pull out an `epidist` object from one of the entries.
#'
#' @details The current criterion for choosing a distribution when multiple
#' entries exist in the database for a given disease and epidemiological
#' distribution is to return the entry with the largest sample size.
#' If a distribution from a specific study is required, the `author` argument
#' can be specified.
#'
#' @param disease A character string specifying the disease
#' @param epi_dist A character string specifying the epidemiological
#' distribution
#' @param author The author of the study reporting the distribution
#' Multiple entries (`<epidist>` objects) can be returned, use the arguments
#' to subset entries and use `single_epidist = TRUE` to force a single
#' `<epidist>` to be returned.
#'
#' @return An `epidist` object
#' @details `disease`, `epi_dist` and `author` are given as individual arguments
#' as these are the most common variables to subset the parameter library by.
#' The `subset` argument facilitates all other subsetting of rows to select the
#' `<epidist>` object(s) desired. To subset based on multiple variables separate
#' each expression with `&`.
#'
#' @param disease A `character` string specifying the disease.
#' @param epi_dist A `character` string specifying the epidemiological
#' distribution.
#' @param author A `character` string specifying the author of the study
#' reporting the distribution.
#' @param subset Either `NULL`, a valid R expression that evaluates to
#' logicals to subset the rows of `<epiparam>`, or a function that can be
#' applied directly to an `<epiparam>` object.
#'
#' This argument allows general `<data.frame>` subsetting that can be combined
#' with the subsetting done with the `disease` and `epi_dist` arguments
#' (and `author` if specified). If left as `NULL` (default) no subsetting is
#' carried out.
#'
#' The expression should be specified without using the data object name
#' (e.g. `df$var`) and instead just `var` should be supplied. In
#' other words, this argument works the same as the `subset` argument in
#' [`subset()`]. It is similar to `<data-masking>` used by the `dplyr` package.
#'
#' @param single_epidist A boolean `logical` determining whether a single
#' `<epidist>` or multiple entries from the library can be returned if
#' matched by the other arguments (`disease`, `epi_dist`, `author`). This
#' argument is used to prevent multiple sets of parameters being returned
#' when only one is wanted.
#'
#' **Note**: If multiple entries match the arguments supplied and
#' `single_epidist = TRUE` then the `<epidist>` that is parameterised and
#' has the largest sample size will be returned (see [`is_parameterised()`]).
joshwlambert marked this conversation as resolved.
Show resolved Hide resolved
#' If multiple entries are equal after this sorting the first entry will
#' be returned.
#'
#' @return An `epidist` object or list of `epidist` objects.
#' @export
#'
#' @examples
Expand All @@ -36,18 +69,53 @@
#' edist2 <- as_epidist(eparam)
#' # check the two methods produce the same `epidist` object
#' identical(edist, edist2)
#'
#' # example using custom subsetting
#' eparam <- epidist_db(
#' disease = "SARS",
#' epi_dist = "offspring_distribution",
#' subset = sample_size > 40
#' )
#'
#' # example using functional subsetting
#' eparam <- epidist_db(
#' disease = "COVID-19",
#' epi_dist = "incubation_period",
#' subset = is_parameterised
#' )
#'
#' # example forcing a single <epidist> to be returned
#' eparam <- epidist_db(
#' disease = "SARS",
#' epi_dist = "offspring_distribution",
#' single_epidist = TRUE
#' )
epidist_db <- function(disease,
epi_dist = c("incubation_period",
"onset_to_hospitalisation",
"onset_to_death",
"serial_interval",
"generation_time",
"offspring_distribution"),
author = NULL) {

epi_dist = c(
"incubation_period",
"onset_to_hospitalisation",
"onset_to_death",
"serial_interval",
"generation_time",
"offspring_distribution"
),
author = NULL,
subset = NULL,
single_epidist = FALSE) {
# check input
checkmate::assert_string(disease)
epi_dist <- match.arg(arg = epi_dist, several.ok = FALSE)
checkmate::assert_logical(single_epidist, len = 1)

# capture expression from subset and check type
expr <- substitute(subset)
if (is.character(expr)) {
stop(
"Subsetting is done with expressions that return logical values.\n",
"Remove quotation marks.",
call. = FALSE
)
}

# read in database
eparam <- epiparam(epi_dist = epi_dist)
Expand All @@ -66,10 +134,8 @@ epidist_db <- function(disease,
# filter based on pathogen and delay distribution
eparam <- eparam[clean_disease(eparam$disease) == disease, ]

# extract study or default to largest sample size
if (is.null(author)) {
eparam <- eparam[which.max(eparam$sample_size), ]
} else {
# extract study by author if given
if (!is.null(author)) {
study <- match.arg(
arg = author,
choices = unique(eparam$author),
Expand All @@ -78,18 +144,63 @@ epidist_db <- function(disease,
eparam <- eparam[eparam$author == study, ]
}

# Throw warning if more than one row found and select first
if (nrow(eparam) > 1) {
warning(
"More than one study found. Selecting first one. ",
"Please report an issue with duplicated studies.",
# subset by subset conditions
if (is.call(expr)) {
set <- eval(expr = expr, envir = eparam, enclos = parent.frame())
eparam <- eparam[set, ]
} else if (is.function(subset)) {
set <- do.call(subset, args = list(eparam))
eparam <- eparam[set, ]
}

if (nrow(eparam) == 0) {
stop(
"No entries in the database meet the subset criteria.",
call. = FALSE
)
eparam <- eparam[1, ]
}

# convert epiparam to epidist
edist <- as_epidist(x = eparam)
if (nrow(eparam) == 1) {
edist <- as_epidist(x = eparam)
validate_epidist(edist)
} else {
edist <- suppressMessages(as_epidist(x = eparam))
is_param <- vapply(edist, is_parameterised, FUN.VALUE = logical(1))

if (single_epidist) {
# select parameterised entries
if (sum(is_param) >= 1) {
edist <- edist[is_param]
}
# select largest sample size
idx <- which.max(
vapply(
edist,
function(x) x$metadata$sample_size,
FUN.VALUE = numeric(1)
)
)
edist <- edist[[idx]]
validate_epidist(edist)

message(
"Using ", get_citation(edist), ". \n",
"To retrieve the short citation use the 'get_citation' function"
)
} else {
lapply(edist, validate_epidist)

message(
"Returning ", nrow(eparam), " results that match the criteria ",
"(", sum(is_param), " are parameterised). \n",
"Use subset to filter by entry variables or ",
"single_epidist to return a single entry. \n",
"To retrieve the short citation for each use the ",
"'get_citation' function"
)
}
}

# return epidist
edist
Expand Down
Loading
Loading