tests(epi_slide_opt): on example data sets

brookslogan · brookslogan · commit dd84924a3391 · 2025-03-10T12:52:08.000-07:00
diff --git a/tests/testthat/test-epi_slide_opt_archive.R b/tests/testthat/test-epi_slide_opt_archive.R
@@ -35,7 +35,7 @@ test_that("epi_slide_opt_archive_one_epikey works as expected", {
     ),
     tibble(
       version = 13, time_value = 8:10, value = c(9, 9, 10),
-      slide_value = frollmean(c(6, 7, 9, 9, 10),  3,  algo = "exact")[-(1:2)]
+      slide_value = frollmean(c(6, 7, 9, 9, 10), 3, algo = "exact")[-(1:2)]
     ),
     tibble(
       version = 14, time_value = 11:13, value = c(NA, 12, 13), slide_value = rep(NA_real_, 3L)
@@ -89,7 +89,6 @@ test_that("epi_slide_opt.epi_archive is not confused by unique(DT$version) unsor
 })
 
 test_that("epi_slide_opt.epi_archive is not confused by unique(DT$time_value) unsorted", {
-
   start_date <- as.Date("2020-01-01")
   tibble(
     geo_value = c(1, 1, 2, 2),
@@ -109,26 +108,77 @@ test_that("epi_slide_opt.epi_archive is not confused by unique(DT$time_value) un
       ) %>%
         as_epi_archive()
     )
-
 })
 
-test_that("epi_slide_opt.epi_archive is equivalent to epix_slide reconversion on example data", {
-
-  case_death_rate_archive %>%
-    epi_slide_opt(case_rate, frollmean, .window_size = 7
-                  # , algo = "exact"
-                  ) %>%
-    .$DT %>%
-    as.data.frame() %>%
-    as_tibble() %>%
-    filter(!approx_equal(case_rate_7dav, case_rate_7d_av, 1e-6, TRUE)) %>%
-    dplyr::transmute(version, geo_value, time_value, case_rate_7dav, case_rate_7d_av,
-                     abs_diff = abs(case_rate_7dav - case_rate_7d_av)) %>%
-    {}
-
-    # TODO finish tests on example data sets
-
-  })
-
+test_that("epi_slide_opt.epi_archive gives expected results on example data", {
+  # vs. built-in case_rate_7d_av column.
+  #
+  # If we were to compare the recomputed archive's keyset vs. the original's,
+  # it would differ, as the original contains some tiny deviations in
+  # values that don't seem achievable with available sliding functions. E.g., in
+  # the recomputed result, geo "ak" version "2020-11-01" changes time 2020-03-13
+  # from 0 to 0.138 and time 2020-03-14 from a slightly different value of 0.138
+  # to 0, while nearby times remained stable; in the original, this resulted in
+  # a tiny update to the 7d_av for 2020-03-14 but not following times somehow,
+  # while in the recomputation there are also minute updates to 2020-03-15 and
+  # 2020-03-16; 2020-03-17 onward have other case_rate changes factoring in.
+  # Compactifying and comparing with tolerances would help account for some of
+  # these differences, but only through writing this was it realized that both
+  # archives would need the recompactification with tolerance; it's not just
+  # epi_slide_opt.epi_archive's very rigid compactification that's the cause.
+  # (Side note: a configurable compactification tolerance wasn't included in
+  # epi_slide_opt.epi_archive, both because it felt strange to apply the
+  # tolerance to all columns rather than just the computed columns, and because
+  # one approach to compactifying just the new columns caused a slowdown and
+  # awkwardly wouldn't match what's possible with just the construction
+  # functions.)
+  #
+  # --> just compare essentially an epix_merge of the original & the recomputation:
+  case_death_rate_archive_time <- system.time(
+    case_death_rate_archive_result <- case_death_rate_archive %>%
+      epi_slide_opt(case_rate, frollmean, algo = "exact", .window_size = 7)
+  )
+  expect_equal(
+    case_death_rate_archive_result$DT$case_rate_7dav,
+    case_death_rate_archive_result$DT$case_rate_7d_av
+  )
+
+  # vs. computing via epix_slide:
+
+  mini_case_death_rate_archive <- case_death_rate_archive %>%
+    {
+      as_tibble(as.data.frame(.$DT))
+    } %>%
+    filter(geo_value %in% head(unique(geo_value), 4L)) %>%
+    as_epi_archive()
+
+  mini_case_death_rate_archive_time_opt <- system.time(
+    mini_case_death_rate_archive_result <- mini_case_death_rate_archive %>%
+      epi_slide_opt(case_rate, frollmean, .window_size = 7)
+  )
+
+  mini_case_death_rate_archive_time_gen <- system.time(
+    mini_case_death_rate_archive_expected <- mini_case_death_rate_archive %>%
+      epix_slide(~ .x %>% epi_slide_opt(case_rate, frollmean, .window_size = 7)) %>%
+      select(names(mini_case_death_rate_archive$DT), everything()) %>%
+      as_epi_archive()
+  )
+
+  expect_equal(mini_case_death_rate_archive_result, mini_case_death_rate_archive_expected)
+
+  archive_cases_dv_subset_time_opt <- system.time(
+    archive_cases_dv_subset_result <- archive_cases_dv_subset %>%
+      epi_slide_opt(percent_cli, frollmean, .window_size = 7)
+  )
+
+  archive_cases_dv_subset_time_gen <- system.time(
+    archive_cases_dv_subset_expected <- archive_cases_dv_subset %>%
+      epix_slide(~ .x %>% epi_slide_opt(percent_cli, frollmean, .window_size = 7)) %>%
+      select(geo_value, time_value, version, everything()) %>%
+      as_epi_archive()
+  )
+
+  expect_equal(archive_cases_dv_subset_result, archive_cases_dv_subset_expected)
+})
 
 # TODO grouped behavior checks