epiverse-trace · joshwlambert · Aug 24, 2023 · Jul 24, 2023 · Jul 24, 2023 · Aug 22, 2023
diff --git a/NAMESPACE b/NAMESPACE
@@ -20,6 +20,8 @@ S3method(get_citation,epidist)
 S3method(get_citation,epiparam)
 S3method(get_parameters,epidist)
 S3method(head,epiparam)
+S3method(is_parameterised,epidist)
+S3method(is_parameterised,epiparam)
 S3method(plot,epidist)
 S3method(plot,vb_epidist)
 S3method(print,epidist)

diff --git a/R/checkers.R b/R/checkers.R
@@ -0,0 +1,84 @@
+#' Check whether an <epidist> or <epiparam> object contains a distribution
+#' and parameters for that distribution
+#'
+#' @description If the <epidist> object or a row in the <epiparam> data frame
+#' is missing either a probability distribution or parameters for the
+#' probability distribution, `is_parameterised` returns `FALSE`, otherwise
+#' it returns `TRUE`.
+#'
+#' @param x An `epidist` or `epiparam` object.
+#' @param ... [`dots`] not used, extra arguments supplied will cause a warning.
+#'
+#' @return A single `logical` for an <epidist>, or a `logical` vector with one
+#' element for each entry (row) in the <epiparam>.
+#' @export
+#'
+#' @examples
+#' # parameterised <epidist>
+#' edist <- epidist(
+#'   disease = "ebola",
+#'   epi_dist = "incubation",
+#'   prob_distribution = "gamma",
+#'   prob_distribution_params = c(shape = 1, scale = 1)
+#' )
+#' is_parameterised(edist)
+#'
+#' # unparameterised <epidist>
+#' edist <- epidist(
+#'   disease = "ebola",
+#'   epi_dist = "incubation"
+#' )
+#' is_parameterised(edist)
+is_parameterised <- function(x, ...) {
+  UseMethod("is_parameterised") # S3 dispatch on the class of x
+}
+
+#' @export
+is_parameterised.epiparam <- function(x, ...) {
+  # warn when unused extra arguments are supplied
+  chkDots(...)
+  # check every entry of x for a distribution and its parameters
+  has_params <- vapply(
+    as.data.frame(t(x)), # transposed so each column is one entry (row) of x
+    FUN = function(y) {
+      if (is.na(y$prob_distribution)) {
+        return(FALSE) # no probability distribution recorded
+      }
+      if (y$prob_distribution %in% c("gamma", "weibull")) {
+        out <- !is.na(y$shape) && !is.na(y$scale)
+        return(out)
+      }
+      if (y$prob_distribution == "lnorm") {
+        out <- !is.na(y$meanlog) && !is.na(y$sdlog)
+        return(out)
+      }
+      if (y$prob_distribution == "nbinom") {
+        out <- !is.na(y$mean) && !is.na(y$dispersion)
+        return(out)
+      }
+      if (y$prob_distribution %in% c("geom", "pois")) {
+        out <- !is.na(y$mean)
+        return(out)
+      }
+      return(FALSE) # any other distribution counts as unparameterised
+    },
+    FUN.VALUE = logical(1),
+    USE.NAMES = FALSE
+  )
+
+  # one logical per entry (row) of x
+  has_params
+}
+
+#' @export
+is_parameterised.epidist <- function(x, ...) {
+  # warn when unused extra arguments are supplied
+  chkDots(...)
+
+  # the <epidist> is parameterised when its `prob_dist` element is a classed
+  # object, so the outcome of that test is the value to hand back
+  has_prob_dist <- is.object(x$prob_dist)
+
+  # return result
+  has_prob_dist
+}
diff --git a/R/epidist.R b/R/epidist.R
@@ -333,11 +333,19 @@ validate_epidist <- function(epidist) {
       c("disease", "epi_dist", "prob_dist", "uncertainty", "summary_stats",
         "citation", "metadata", "method_assess", "notes") %in%
       attributes(epidist)$names,
-    "Epidist must contains a disease (single character string)" =
+    "epidist must contain a disease (single character string)" =
       is.character(epidist$disease$disease) &&
       length(epidist$disease$disease) == 1,
-    "Epidist must contain an epidemiological distribution" =
-      is.character(epidist$epi_dist) && length(epidist$epi_dist) == 1
+    "epidist must contain an epidemiological distribution" =
+      is.character(epidist$epi_dist) && length(epidist$epi_dist) == 1,
+    "epidist must contain uncertainty, summary stats and metadata" =
+      all(
+        is.list(epidist$uncertainty),
+        is.list(epidist$summary_stats),
+        is.list(epidist$metadata)
+      ),
+    "epidist must contain a citation" =
+      is.character(epidist$citation)
   )
 
   invisible(epidist)
@@ -918,47 +926,3 @@ is_truncated <- function(x) {
     return(FALSE)
   }
 }
-
-#' Checks whether <epidist> object contains a distribution and parameters for
-#' that distribution.
-#'
-#' @description If the <epidist> object is missing either a probability
-#' distribution or parameters for the probability distribution,
-#' `is_parameterised` returns `FALSE`, otherwise it returns `TRUE`
-#'
-#'
-#' @param x An `epidist` object.
-#'
-#' @return A boolean logical.
-#' @export
-#'
-#' @examples
-#' # parameterised <epidist>
-#' edist <- epidist(
-#'   disease = "ebola",
-#'   epi_dist = "incubation",
-#'   prob_distribution = "gamma",
-#'   prob_distribution_params = c(shape = 1, scale = 1)
-#' )
-#' is_parameterised(edist)
-#'
-#' # unparameterised <epidist>
-#' edist <- epidist(
-#'   disease = "ebola",
-#'   epi_dist = "incubation"
-#' )
-#' is_parameterised(edist)
-is_parameterised <- function(x) {
-
-  stopifnot(
-    "is_parameterised only works for <epidist> objects" =
-      is_epidist(x)
-  )
-
-  # probability distribution object
-  if (is.object(x$prob_dist)) {
-    return(TRUE)
-  }
-
-  return(FALSE)
-}
diff --git a/R/epidist_db.R b/R/epidist_db.R
@@ -1,22 +1,55 @@
-#' Create an `epidist` object directly from the epiparameter library (database)
+#' Create an `epidist` object(s) directly from the epiparameter library
+#' (database)
 #'
-#' @description This function can extract an `epidist` object directly from the
-#' library of epidemiological parameters without having to read in an `epiparam`
-#' object and pull out an `epidist` object from one of the entries. If a
-#' distribution from a specific study is required, the `author` argument can
-#' be specified. For now this must match the author entry in the database
-#' exactly to be recognised.
+#' @description This function can extract an `epidist` object(s) directly from
+#' the library of epidemiological parameters without having to read in an
+#' `epiparam` object and pull out an `epidist` object from one of the entries.
 #'
-#' @details The current criteria for chosing a distribution when multiple
-#' entries exist in the database for a given disease and epidemiological
-#' distribution is to return the entry with the largest sample size.
+#' If a distribution from a specific study is required, the `author` argument
+#' can be specified.
 #'
-#' @param disease A character string specifying the disease
-#' @param epi_dist A character string specifying the epidemiological
-#' distribution
-#' @param author The author of the study reporting the distribution
+#' Multiple entries (`<epidist>` objects) can be returned, use the arguments
+#' to subset entries and use `single_epidist = TRUE` to force a single
+#' `<epidist>` to be returned.
 #'
-#' @return An `epidist` object
+#' @details `disease`, `epi_dist` and `author` are given as individual arguments
+#' as these are the most common variables to subset the parameter library by.
+#' The `subset` argument facilitates all other subsetting of rows to select the
+#' `<epidist>` object(s) desired. To subset based on multiple variables separate
+#' each expression with `&`.
+#'
+#' @param disease A `character` string specifying the disease.
+#' @param epi_dist A `character` string specifying the epidemiological
+#' distribution.
+#' @param author A `character` string specifying the author of the study
+#' reporting the distribution.
+#' @param subset Either `NULL` or a valid R expression that evaluates to
+#' logicals to subset the rows of `<epiparam>`, or a function that can be
+#' applied directly to an `<epiparam>` object.
+#'
+#' This argument allows general `<data.frame>` subsetting that can be combined
+#' with the subsetting done with the `disease` and `epi_dist` arguments
+#' (and `author` if specified). If left as `NULL` (default) no subsetting is
+#' carried out.
+#'
+#' The expression should be specified without using the data object name
+#' (e.g. `df$var`) and instead just `var` should be supplied. In
+#' other words, this argument works the same as the `subset` argument in
+#' [`subset()`]. It is similar to `<data-masking>` used by the `dplyr` package.
+#'
+#' @param single_epidist A boolean `logical` determining whether a single
+#' `<epidist>` or multiple entries from the library can be returned if
+#' matched by the other arguments (`disease`, `epi_dist`, `author`). This
+#' argument is used to prevent multiple sets of parameters being returned
+#' when only one is wanted.
+#'
+#' **Note**: If multiple entries match the arguments supplied and
+#' `single_epidist = TRUE` then the `<epidist>` that is parameterised and
+#' has the largest sample size will be returned (see [`is_parameterised()`]).
+#' If multiple entries are equal after this sorting the first entry will
+#' be returned.
+#'
+#' @return An `epidist` object or list of `epidist` objects.
 #' @export
 #'
 #' @examples
@@ -36,18 +69,53 @@
 #' edist2 <- as_epidist(eparam)
 #' # check the two methods produce the same `epidist` object
 #' identical(edist, edist2)
+#'
+#' # example using custom subsetting
+#' eparam <- epidist_db(
+#'   disease = "SARS",
+#'   epi_dist = "offspring_distribution",
+#'   subset = sample_size > 40
+#' )
+#'
+#' # example using functional subsetting
+#' eparam <- epidist_db(
+#'   disease = "COVID-19",
+#'   epi_dist = "incubation_period",
+#'   subset = is_parameterised
+#' )
+#'
+#' # example forcing a single <epidist> to be returned
+#' eparam <- epidist_db(
+#'   disease = "SARS",
+#'   epi_dist = "offspring_distribution",
+#'   single_epidist = TRUE
+#' )
 epidist_db <- function(disease,
-                       epi_dist = c("incubation_period",
-                                    "onset_to_hospitalisation",
-                                    "onset_to_death",
-                                    "serial_interval",
-                                    "generation_time",
-                                    "offspring_distribution"),
-                       author = NULL) {
-
+                       epi_dist = c(
+                         "incubation_period",
+                         "onset_to_hospitalisation",
+                         "onset_to_death",
+                         "serial_interval",
+                         "generation_time",
+                         "offspring_distribution"
+                       ),
+                       author = NULL,
+                       subset = NULL,
+                       single_epidist = FALSE) {
   # check input
   checkmate::assert_string(disease)
   epi_dist <- match.arg(arg = epi_dist, several.ok = FALSE)
+  checkmate::assert_logical(single_epidist, len = 1)
+
+  # capture expression from subset and check type
+  expr <- substitute(subset)
+  if (is.character(expr)) {
+    stop(
+      "Subsetting is done with expressions that return logical values.\n",
+      "Remove quotation marks.",
+      call. = FALSE
+    )
+  }
 
   # read in database
   eparam <- epiparam(epi_dist = epi_dist)
@@ -66,10 +134,8 @@ epidist_db <- function(disease,
   # filter based on pathogen and delay distribution
   eparam <- eparam[clean_disease(eparam$disease) == disease, ]
 
-  # extract study or default to largest sample size
-  if (is.null(author)) {
-    eparam <- eparam[which.max(eparam$sample_size), ]
-  } else {
+  # extract study by author if given
+  if (!is.null(author)) {
     study <- match.arg(
       arg = author,
       choices = unique(eparam$author),
@@ -78,18 +144,63 @@ epidist_db <- function(disease,
     eparam <- eparam[eparam$author == study, ]
   }
 
-  # Throw warning if more than one row found and select first
-  if (nrow(eparam) > 1) {
-    warning(
-      "More than one study found. Selecting first one. ",
-      "Please report an issue with duplicated studies.",
+  # subset by subset conditions
+  if (is.call(expr)) {
+    set <- eval(expr = expr, envir = eparam, enclos = parent.frame())
+    eparam <- eparam[set, ]
+  } else if (is.function(subset)) {
+    set <- do.call(subset, args = list(eparam))
+    eparam <- eparam[set, ]
+  }
+
+  if (nrow(eparam) == 0) {
+    stop(
+      "No entries in the database meet the subset criteria.",
       call. = FALSE
     )
-    eparam <- eparam[1, ]
   }
 
   # convert epiparam to epidist
-  edist <- as_epidist(x = eparam)
+  if (nrow(eparam) == 1) {
+    edist <- as_epidist(x = eparam)
+    validate_epidist(edist)
+  } else {
+    edist <- suppressMessages(as_epidist(x = eparam))
+    is_param <- vapply(edist, is_parameterised, FUN.VALUE = logical(1))
+
+    if (single_epidist) {
+      # select parameterised entries
+      if (sum(is_param) >= 1) {
+        edist <- edist[is_param]
+      }
+      # select largest sample size
+      idx <- which.max(
+        vapply(
+          edist,
+          function(x) x$metadata$sample_size,
+          FUN.VALUE = numeric(1)
+        )
+      )
+      edist <- edist[[idx]]
+      validate_epidist(edist)
+
+      message(
+        "Using ", get_citation(edist), ". \n",
+        "To retrieve the short citation use the 'get_citation' function"
+      )
+    } else {
+      lapply(edist, validate_epidist)
+
+      message(
+        "Returning ", nrow(eparam), " results that match the criteria ",
+        "(", sum(is_param), " are parameterised). \n",
+        "Use subset to filter by entry variables or ",
+        "single_epidist to return a single entry. \n",
+        "To retrieve the short citation for each use the ",
+        "'get_citation' function"
+      )
+    }
+  }
 
   # return epidist
   edist