cmu-delphi · Aug 13, 2024
diff --git a/‎R/revision_analysis.R
+32-26 b/‎R/revision_analysis.R
+32-26
diff --git a/‎man/f_no_na.Rd
+14 b/‎man/f_no_na.Rd
+14
diff --git a/‎man/max_no_na.Rd
-14 b/‎man/max_no_na.Rd
-14
diff --git a/‎man/revision_summary.Rd
+14-2 b/‎man/revision_summary.Rd
+14-2
diff --git a/‎man/spread_vec.Rd
-12 b/‎man/spread_vec.Rd
-12
diff --git a/‎tests/testthat/_snaps/revision-latency-functions.md
+38-38 b/‎tests/testthat/_snaps/revision-latency-functions.md
+38-38
diff --git a/‎tests/testthat/test-revision-latency-functions.R
+1-1 b/‎tests/testthat/test-revision-latency-functions.R
+1-1
@@ -9,25 +9,35 @@
 #'   includes `NA`'s)
 #'  3. `max_lag`: the amount of time until the final (new) version (same caveat
 #'   for `drop_nas=FALSE`, though it is far less likely to matter)
-#'  4. `spread`: the difference between the smallest and largest values (this
+#'  4. `min_value`: the minimum value across revisions
+#'  5. `max_value`: the maximum value across revisions
+#'  6. `median_value`: the median value across revisions
+#'  7. `spread`: the difference between the smallest and largest values (this
 #'   always excludes `NA` values)
-#'  5. `rel_spread`: `spread` divided by the largest value (so it will
+#'  8. `rel_spread`: `spread` divided by the largest value (so it will
 #'   always be less than 1). Note that this need not be the final value. It will
 #'   be `NA` whenever `spread` is 0.
-#'  6. `time_near_latest`: This gives the lag when the value is within
+#'  9. `time_near_latest`: This gives the lag when the value is within
 #'   `within_latest` (default 20%) of the value at the latest time. For example,
 #'   consider the series (0,20, 99, 150, 102, 100); then `time_near_latest` is
 #'   the 5th index, since even though 99 is within 20%, it is outside the window
 #'   afterwards at 150.
 #' @param epi_arch an epi_archive to be analyzed
-#' @param ... <[`tidyselect`][dplyr_tidy_select]>, used to choose the column to summarize. If empty, it
-#'   chooses the first. Currently only implemented for one column at a time.
+#' @param ... <[`tidyselect`][dplyr_tidy_select]>, used to choose the column to
+#'   summarize. If empty, it chooses the first. Currently only implemented for
+#'   one column at a time.
 #' @param drop_nas bool, drop any `NA` values from the archive? After dropping
 #'   `NA`'s compactify is run again to make sure there are no duplicate values
 #'   from occasions when the signal is revised to `NA`, and then back to its
 #'   immediately-preceding value.
 #' @param print_inform bool, determines whether to print summary information, or
 #'   only return the full summary tibble
+#' @param min_waiting_period `difftime`, integer or `NULL`. Sets a cutoff: any
+#'   time_values not earlier than `min_waiting_period` before `versions_end` are
+#'   removed. `min_waiting_period` should characterize the typical time during
+#'   which revisions occur.  The default of 60 days corresponds to a typical
+#'   final value for case counts as reported in the context of insurance. To
+#'   avoid this filtering, either set to `NULL` or 0.
 #' @param within_latest double between 0 and 1. Determines the threshold
 #'   used for the `time_to`
 #' @param quick_revision difftime or integer (integer is treated as days), for
@@ -60,6 +70,7 @@ revision_summary <- function(epi_arch,
                              ...,
                              drop_nas = TRUE,
                              print_inform = TRUE,
+                             min_waiting_period = as.difftime(60, units = "days"),
                              within_latest = 0.2,
                              quick_revision = as.difftime(3, units = "days"),
                              few_revisions = 3,
@@ -92,6 +103,11 @@ revision_summary <- function(epi_arch,
   revision_behavior <-
     epi_arch$DT %>%
     select(c(geo_value, time_value, all_of(keys), version, !!arg))
+  if (!is.null(min_waiting_period)) {
+    revision_behavior <- revision_behavior %>%
+      filter(abs(time_value - as.Date(epi_arch$versions_end)) >= min_waiting_period)
+  }
+
   if (drop_nas) {
     # if we're dropping NA's, we should recompactify
     revision_behavior <-
@@ -113,18 +129,22 @@ revision_summary <- function(epi_arch,
       n_revisions = dplyr::n() - 1,
       min_lag = min(lag), # nolint: object_usage_linter
       max_lag = max(lag), # nolint: object_usage_linter
-      spread = spread_vec(pick(!!arg)),
-      rel_spread = spread / max_no_na(pick(!!arg)), # nolint: object_usage_linter
+      min_value = f_no_na(min, pick(!!arg)),
+      max_value = f_no_na(max, pick(!!arg)),
+      median_value = f_no_na(median, pick(!!arg)),
       time_to = time_within_x_latest(lag, pick(!!arg), prop = within_latest), # nolint: object_usage_linter
       .groups = "drop"
     ) %>%
     mutate(
+      spread = max_value - min_value, # nolint: object_usage_linter
+      rel_spread = spread / max_value, # nolint: object_usage_linter
       # TODO the units here may be a problem
       min_lag = as.difftime(min_lag, units = "days"), # nolint: object_usage_linter
       max_lag = as.difftime(max_lag, units = "days"), # nolint: object_usage_linter
       time_near_latest = as.difftime(time_to, units = "days") # nolint: object_usage_linter
     ) %>%
-    select(-time_to)
+    select(-time_to) %>%
+    relocate(time_value, geo_value, all_of(keys), n_revisions, min_lag, max_lag, time_near_latest, spread, rel_spread, min_value, max_value, median_value)
   if (print_inform) {
     cli_inform("Min lag (time to first version):")
     difftime_summary(revision_behavior$min_lag) %>% print()
@@ -203,31 +223,17 @@ get_last_run <- function(bool_vec, values_from) {
   values_from[[length(bool_vec) - tail(runs$lengths, n = 1) + 1]]
 }
 
-#' the default behavior returns a warning on empty lists, which we do not want,
-#' and there is no super clean way of preventing this
+#' use when the default behavior returns a warning on empty lists, which we do
+#' not want, and there is no super clean way of preventing this
 #' @keywords internal
-max_no_na <- function(x) {
+f_no_na <- function(f, x) {
   x <- x[!is.na(x)]
   if (length(x) == 0) {
     return(Inf)
   } else {
-    return(max(x))
+    return(f(x))
   }
 }
-#' the default behavior returns a warning on empty lists, which we do not want
-#' @keywords internal
-spread_vec <- function(x) {
-  x <- x[!is.na(x)]
-  if (length(x) == 0) {
-    return(-Inf)
-  } else {
-    res <- x %>%
-      range(na.rm = TRUE) %>%
-      diff(na.rm = TRUE)
-    return(res)
-  }
-}
-
 
 
 #' simple util for printing a fraction and it's percent
 
@@ -24,25 +24,25 @@
     Output
            min median     mean     max
         0 days 3 days 6.9 days 19 days
-      # A tibble: 7 x 8
-        time_value geo_value n_revisions min_lag max_lag spread rel_spread
-        <date>     <chr>           <dbl> <drtn>  <drtn>   <dbl>      <dbl>
-      1 2020-01-01 ak                  4 2 days  19 days    101      0.990
-      2 2020-01-01 al                  1 0 days  19 days     99      0.99 
-      3 2020-01-02 ak                  1 4 days   5 days      9      0.09 
-      4 2020-01-02 al                  0 0 days   0 days      0      0    
-      5 2020-01-03 ak                  0 3 days   3 days      0    NaN    
-      6 2020-01-03 al                  1 1 days   2 days      3      0.75 
-      7 2020-01-04 al                  0 1 days   1 days      0      0    
-        time_near_latest
-        <drtn>          
-      1 19 days         
-      2 19 days         
-      3  4 days         
-      4  0 days         
-      5  3 days         
-      6  2 days         
-      7  1 days         
+      # A tibble: 7 x 11
+        time_value geo_value n_revisions min_lag max_lag time_near_latest spread
+        <date>     <chr>           <dbl> <drtn>  <drtn>  <drtn>            <dbl>
+      1 2020-01-01 ak                  4 2 days  19 days 19 days             101
+      2 2020-01-01 al                  1 0 days  19 days 19 days              99
+      3 2020-01-02 ak                  1 4 days   5 days  4 days               9
+      4 2020-01-02 al                  0 0 days   0 days  0 days               0
+      5 2020-01-03 ak                  0 3 days   3 days  3 days               0
+      6 2020-01-03 al                  1 1 days   2 days  2 days               3
+      7 2020-01-04 al                  0 1 days   1 days  1 days               0
+        rel_spread min_value max_value median_value
+             <dbl>     <dbl>     <dbl>        <dbl>
+      1      0.990         1       102          6  
+      2      0.99          1       100         50.5
+      3      0.09         91       100         95.5
+      4      0             1         1          1  
+      5    NaN             0         0          0  
+      6      0.75          1         4          2.5
+      7      0             9         9          9  
 
 ---
 
@@ -72,23 +72,23 @@
     Output
            min median     mean     max
         0 days 3 days 6.9 days 19 days
-      # A tibble: 7 x 8
-        time_value geo_value n_revisions min_lag max_lag spread rel_spread
-        <date>     <chr>           <dbl> <drtn>  <drtn>   <dbl>      <dbl>
-      1 2020-01-01 ak                  6 2 days  19 days    101      0.990
-      2 2020-01-01 al                  1 0 days  19 days     99      0.99 
-      3 2020-01-02 ak                  1 4 days   5 days      9      0.09 
-      4 2020-01-02 al                  0 0 days   0 days      0      0    
-      5 2020-01-03 ak                  0 3 days   3 days      0    NaN    
-      6 2020-01-03 al                  1 1 days   2 days      3      0.75 
-      7 2020-01-04 al                  1 0 days   1 days      0      0    
-        time_near_latest
-        <drtn>          
-      1 19 days         
-      2 19 days         
-      3  4 days         
-      4  0 days         
-      5  3 days         
-      6  2 days         
-      7  1 days         
+      # A tibble: 7 x 11
+        time_value geo_value n_revisions min_lag max_lag time_near_latest spread
+        <date>     <chr>           <dbl> <drtn>  <drtn>  <drtn>            <dbl>
+      1 2020-01-01 ak                  6 2 days  19 days 19 days             101
+      2 2020-01-01 al                  1 0 days  19 days 19 days              99
+      3 2020-01-02 ak                  1 4 days   5 days  4 days               9
+      4 2020-01-02 al                  0 0 days   0 days  0 days               0
+      5 2020-01-03 ak                  0 3 days   3 days  3 days               0
+      6 2020-01-03 al                  1 1 days   2 days  2 days               3
+      7 2020-01-04 al                  1 0 days   1 days  1 days               0
+        rel_spread min_value max_value median_value
+             <dbl>     <dbl>     <dbl>        <dbl>
+      1      0.990         1       102          5.5
+      2      0.99          1       100         50.5
+      3      0.09         91       100         95.5
+      4      0             1         1          1  
+      5    NaN             0         0          0  
+      6      0.75          1         4          2.5
+      7      0             9         9          9  
 
@@ -28,7 +28,7 @@ dummy_ex <- tibble::tribble(
   "ak", as.Date("2020-01-03"), as.Date("2020-01-06"), 0,
   "ak", as.Date("2020-01-03"), as.Date("2020-01-07"), 0,
 ) %>%
-  as_epi_archive(compactify = FALSE)
+  as_epi_archive(versions_end = as.Date("2022-01-01"), compactify = FALSE)
 
 test_that("revision_summary works for a dummy dataset", {
   expect_snapshot(dummy_ex %>% revision_summary() %>% print(n = 10, width = 300))