approx_equal: make "abs_tol=" mandatory, +validation, +docs

brookslogan · brookslogan · commit 08b1783b1f16 · 2025-03-10T03:41:49.000-07:00
diff --git a/NAMESPACE b/NAMESPACE
@@ -58,6 +58,7 @@ S3method(ungroup,epi_df)
 S3method(ungroup,grouped_epi_archive)
 S3method(unnest,epi_df)
 export("%>%")
+export(approx_equal)
 export(arrange)
 export(arrange_canonical)
 export(as_epi_archive)
@@ -125,8 +126,12 @@ importFrom(checkmate,assert_subset)
 importFrom(checkmate,assert_tibble)
 importFrom(checkmate,checkInt)
 importFrom(checkmate,check_atomic)
+importFrom(checkmate,check_character)
 importFrom(checkmate,check_data_frame)
+importFrom(checkmate,check_logical)
 importFrom(checkmate,check_names)
+importFrom(checkmate,check_null)
+importFrom(checkmate,check_numeric)
 importFrom(checkmate,expect_class)
 importFrom(checkmate,test_int)
 importFrom(checkmate,test_set_equal)
@@ -256,6 +261,7 @@ importFrom(tsibble,as_tsibble)
 importFrom(utils,capture.output)
 importFrom(utils,tail)
 importFrom(vctrs,"vec_slice<-")
+importFrom(vctrs,obj_is_vector)
 importFrom(vctrs,vec_cast)
 importFrom(vctrs,vec_cast_common)
 importFrom(vctrs,vec_data)
diff --git a/R/archive.R b/R/archive.R
@@ -489,12 +489,12 @@ update_is_locf <- function(arranged_updates_df, ukey_names, abs_tol) {
       FALSE, # first observation is not LOCF
       approx_equal0(ekts_tbl,
         inds1 = inds1, ekts_tbl, inds2 = inds2,
-        # check ekt cols without tolerance:
-        abs_tol = 0, na_equal = TRUE
+        # check ekt (key) cols with 0 tolerance:
+        na_equal = TRUE, abs_tol = 0
       ) &
         approx_equal0(vals_tbl,
           inds1 = inds1, vals_tbl, inds2 = inds2,
-          abs_tol = abs_tol, na_equal = TRUE
+          na_equal = TRUE, abs_tol = abs_tol
         )
     )
   }
diff --git a/R/epiprocess-package.R b/R/epiprocess-package.R
@@ -12,7 +12,11 @@
 #' @importFrom checkmate assert_subset
 #' @importFrom checkmate assert_tibble
 #' @importFrom checkmate check_atomic check_data_frame expect_class test_int
+#' @importFrom checkmate check_character
+#' @importFrom checkmate check_logical
 #' @importFrom checkmate check_names
+#' @importFrom checkmate check_null
+#' @importFrom checkmate check_numeric
 #' @importFrom checkmate test_subset test_set_equal vname
 #' @importFrom cli cli_abort cli_warn
 #' @importFrom cli pluralize
@@ -36,6 +40,7 @@
 #' @importFrom tibble is_tibble
 #' @importFrom tidyr nest
 #' @importFrom tools toTitleCase
+#' @importFrom vctrs obj_is_vector
 #' @importFrom vctrs vec_cast
 #' @importFrom vctrs vec_cast_common
 #' @importFrom vctrs vec_data
diff --git a/R/patch.R b/R/patch.R
@@ -1,22 +1,88 @@
 #' Test two vctrs vectors for equality with some tolerance in some cases
 #'
-#' @param vec1,vec2 vctrs vectors (includes data frames)
-#' @param abs_tol tolerance; will be used for bare numeric `vec1`, `vec2`, or
-#'   any such columns within `vec1`, `vec2` if they are data frames
+#' Similar to [`vctrs::vec_equal`]. Behavior may differ from `vec_equal` with
+#' non-`NA` `NaN`s involved, or for bare lists that contain named vectors, and
+#' the precise behavior in these cases may change and should not be relied upon.
+#'
+#' @param vec1,vec2 vctrs vectors (includes data frames). Take care when using
+#'   on named vectors or "keyed" data frames; [`vec_names()`] are largely
+#'   ignored, and key columns are treated as normal value columns (when they
+#'   should probably generate an error if they are not lined up correctly, or be
+#'   tested for exact rather than approximate equality).
 #' @param na_equal should `NA`s be considered equal to each other? (In
 #'   epiprocess, we usually want this to be `TRUE`, but that doesn't match the
 #'   [`vctrs::vec_equal()`] default, so this is mandatory.)
-#' @param .ptype as in [`vctrs::vec_equal()`]
-#' @param inds1,inds2 optional (row) indices into vec1 and vec2; output should
-#'   be consistent with `vec_slice`-ing to these indices beforehand, but can
-#'   give faster computation if `vec1` and `vec2` are data frames.
+#' @param abs_tol absolute tolerance; will be used for bare numeric `vec1`,
+#'   `vec2`, or any such columns within `vec1`, `vec2` if they are data frames.
+#' @param .ptype as in [`vctrs::vec_equal()`].
+#' @param inds1,inds2 optional (row) indices into vec1 and vec2 compatible with
+#'   [`vctrs::vec_slice()`]; output should be consistent with `vec_slice`-ing to
+#'   these indices beforehand, but can give faster computation if `vec1` and
+#'   `vec2` are data frames.
 #'
 #' @return logical vector, with length matching the result of recycling `vec1`
 #'   (at `inds1` if provided) and `vec2` (at `inds2` if provided); entries
-#'   should all be `TRUE` or `FALSE` if `na_equal = TRUE`. Behavior may differ
-#'   from `vec_equal` with non-`NA` `NaN`s involved, or for bare lists that
-#'   contain named vectors.
-approx_equal <- function(vec1, vec2, abs_tol, na_equal, .ptype = NULL, inds1 = NULL, inds2 = NULL) {
+#'   should all be `TRUE` or `FALSE` if `na_equal = TRUE`.
+#'
+#' @examples
+#'
+#' # On numeric vectors:
+#' approx_equal(
+#'   c(1, 2, 3, NA),
+#'   c(1, 2 + 1e-10, NA, NA),
+#'   na_equal = TRUE,
+#'   abs_tol = 1e-8
+#' )
+#'
+#' # On tibbles:
+#' tbl1 <- tibble(
+#'   a = 1:5,
+#'   b = list(1:5, 1:4, 1:3, 1:2, 1:1) %>% lapply(as.numeric),
+#'   c = tibble(
+#'     c1 = 1:5
+#'   ),
+#'   d = matrix(1:10, 5, 2)
+#' )
+#' tbl2 <- tbl1
+#' tbl2$a[[2]] <- tbl1$a[[2]] + 1e-10
+#' tbl2$b[[3]][[1]] <- tbl1$b[[3]][[1]] + 1e-10
+#' tbl2$c$c1[[4]] <- tbl1$c$c1[[4]] + 1e-10
+#' tbl2$d[[5, 2]] <- tbl1$d[[5, 2]] + 1e-10
+#' vctrs::vec_equal(tbl1, tbl2, na_equal = TRUE)
+#' approx_equal(tbl1, tbl2, na_equal = TRUE, abs_tol = 1e-12)
+#' approx_equal(tbl1, tbl2, na_equal = TRUE, abs_tol = 1e-8)
+#'
+#'
+#'
+#'
+#'
+#' # Type comparison within lists is stricter, matching vctrs:
+#' vctrs::vec_equal(list(1:2), list(as.numeric(1:2)))
+#' approx_equal(list(1:2), list(as.numeric(1:2)), FALSE, abs_tol = 0)
+#'
+#' @export
+approx_equal <- function(vec1, vec2, na_equal, .ptype = NULL, ..., abs_tol, inds1 = NULL, inds2 = NULL) {
+  if (!obj_is_vector(vec1)) cli_abort("`vec1` must be recognized by vctrs as a vector")
+  if (!obj_is_vector(vec2)) cli_abort("`vec2` must be recognized by vctrs as a vector")
+  # Leave vec size checking to vctrs recycling ops.
+  assert_logical(na_equal, any.missing = FALSE, len = 1L)
+  # Leave .ptype checks to cast operation.
+  check_dots_empty()
+  assert_numeric(abs_tol, lower = 0, len = 1L)
+  assert(
+    check_null(inds1),
+    check_numeric(inds1),
+    check_logical(inds1),
+    check_character(inds1)
+  )
+  assert(
+    check_null(inds2),
+    check_numeric(inds2),
+    check_logical(inds2),
+    check_character(inds2)
+  )
+  # Leave heavier index validation to the vctrs recycling & indexing ops.
+
   # Recycle inds if provided; vecs if not:
   common_size <- vec_size_common(
     if (is.null(inds1)) vec1 else inds1,
@@ -33,13 +99,13 @@ approx_equal <- function(vec1, vec2, abs_tol, na_equal, .ptype = NULL, inds1 = N
     inds2 <- vec_recycle(inds2, common_size)
   }
   vecs <- vec_cast_common(vec1, vec2, .to = .ptype)
-  approx_equal0(vecs[[1]], vecs[[2]], abs_tol, na_equal, inds1, inds2)
+  approx_equal0(vecs[[1]], vecs[[2]], na_equal, abs_tol, inds1, inds2)
 }
 
 #' Helper for [`approx_equal`] for vecs guaranteed to have the same ptype and size
 #'
 #' @keywords internal
-approx_equal0 <- function(vec1, vec2, abs_tol, na_equal, inds1 = NULL, inds2 = NULL) {
+approx_equal0 <- function(vec1, vec2, na_equal, abs_tol, inds1 = NULL, inds2 = NULL) {
   if (is_bare_numeric(vec1) && abs_tol != 0) {
     # perf: since we're working with bare numerics and logicals: we can use `[`
     # and `fifelse`. Matching vec_equal, we ignore names and other attributes.
@@ -66,7 +132,7 @@ approx_equal0 <- function(vec1, vec2, abs_tol, na_equal, inds1 = NULL, inds2 = N
       rep(TRUE, nrow(vec1))
     } else {
       Reduce(`&`, lapply(seq_len(ncol(vec1)), function(col_i) {
-        approx_equal0(vec1[[col_i]], vec2[[col_i]], abs_tol, na_equal, inds1, inds2)
+        approx_equal0(vec1[[col_i]], vec2[[col_i]], na_equal, abs_tol, inds1, inds2)
       }))
     }
   } else if (is_bare_list(vec1)) {
@@ -78,7 +144,7 @@ approx_equal0 <- function(vec1, vec2, abs_tol, na_equal, inds1 = NULL, inds2 = N
         # consistently inconsistent, we avoid dispatching to vec_equal for bare
         # lists even with abs_tol = 0:
         identical(vec_ptype(entry1), vec_ptype(entry2)) &&
-        all(approx_equal0(entry1, entry2, abs_tol, na_equal))
+        all(approx_equal0(entry1, entry2, na_equal, abs_tol))
     }, logical(1L))
   } else {
     # XXX No special handling for any other types/situations. Makes sense for
@@ -201,8 +267,8 @@ tbl_diff2 <- function(earlier_snapshot, later_tbl,
   combined_compactify_away[combined_ukey_is_repeat] <-
     approx_equal0(combined_vals,
       combined_vals,
-      abs_tol = compactify_abs_tol,
       na_equal = TRUE,
+      abs_tol = compactify_abs_tol,
       inds1 = combined_ukey_is_repeat,
       inds2 = ukey_repeat_first_i
     )
diff --git a/man/approx_equal.Rd b/man/approx_equal.Rd
diff --git a/man/approx_equal0.Rd b/man/approx_equal0.Rd
diff --git a/tests/testthat/test-archive.R b/tests/testthat/test-archive.R
@@ -220,5 +220,5 @@ test_that("`epi_archive` rejects dataframes where time_value and version columns
 test_that("is_locf replacement works as expected", {
   vec <- c(1, 1, 1e-10, 1.1e-10, NA, NA, NaN, NaN)
   is_repeated <- c(0, 1, 0, 1, 0, 1, 1, 1)
-  expect_equal(c(FALSE, approx_equal(head(vec, -1L), tail(vec, -1L), .Machine$double.eps^0.5, na_equal = TRUE)), as.logical(is_repeated))
+  expect_equal(c(FALSE, approx_equal(head(vec, -1L), tail(vec, -1L), na_equal = TRUE, abs_tol = .Machine$double.eps^0.5)), as.logical(is_repeated))
 })

Original file line number	Diff line number	Diff line change
`@@ -489,12 +489,12 @@ update_is_locf <- function(arranged_updates_df, ukey_names, abs_tol) {`
`489`	`489`	`FALSE, # first observation is not LOCF`
`490`	`490`	`approx_equal0(ekts_tbl,`
`491`	`491`	`inds1 = inds1, ekts_tbl, inds2 = inds2,`
`492`		`- # check ekt cols without tolerance:`
`493`		`- abs_tol = 0, na_equal = TRUE`
	`492`	`+ # check ekt (key) cols with 0 tolerance:`
	`493`	`+ na_equal = TRUE, abs_tol = 0`
`494`	`494`	`) &`
`495`	`495`	`approx_equal0(vals_tbl,`
`496`	`496`	`inds1 = inds1, vals_tbl, inds2 = inds2,`
`497`		`- abs_tol = abs_tol, na_equal = TRUE`
	`497`	`+ na_equal = TRUE, abs_tol = abs_tol`
`498`	`498`	`)`
`499`	`499`	`)`
`500`	`500`	`}`