Skip to content

Commit 71116cb

Browse files
committed
Simplify tbl_diff2 with tbl_fast_anti_join
1 parent d1ba68e commit 71116cb

File tree

2 files changed

+48
-93
lines changed

2 files changed

+48
-93
lines changed

R/epi_slide_opt_archive.R

+1-1
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,7 @@ epi_slide_opt_archive_one_epikey <- function(
126126
!is.na(slide_inp_backrefs)
127127
}
128128
out_update <- slide[rows_should_keep, ]
129-
out_diff <- tbl_diff2(prev_out_snapshot, out_update, "time_value", "update")
129+
out_diff <- tbl_diff2_alt2(prev_out_snapshot, out_update, "time_value", "update")
130130
prev_inp_snapshot <<- inp_snapshot
131131
prev_out_snapshot <<- tbl_patch(prev_out_snapshot, out_diff, "time_value")
132132
out_diff$version <- version

R/patch.R

+47-92
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,37 @@ vec_approx_equal0 <- function(vec1, vec2, na_equal, abs_tol, inds1 = NULL, inds2
183183
}
184184
}
185185

186+
#' Variation on [`dplyr::anti_join`] for speed + tolerance setting
187+
#'
188+
#' @param x tibble; `x[ukey_names]` must not have any duplicate rows
189+
#' @param y tibble; `y[ukey_names]` must not have any duplicate rows
190+
#' @param ukey_names chr; names of columns that form a unique key, for `x` and
191+
#' for `y`
192+
#' @param val_names chr; names of columns which should be treated as
193+
#' value/measurement columns, and compared with a tolerance
194+
#' @param abs_tol scalar non-negative numeric; absolute tolerance with which to
195+
#' compare value columns; see [`vec_approx_equal`]
196+
#' @return rows from `x` that either (a) don't have a (0-tolerance) matching
197+
#' ukey in `y`, or (b) have a matching ukey in `y`, but don't have
198+
#' approximately equal value column values
199+
tbl_fast_anti_join <- function(x, y, ukey_names, val_names, abs_tol = 0) {
200+
x <- x[c(ukey_names, val_names)]
201+
y <- y[c(ukey_names, val_names)]
202+
xy <- vec_rbind(x, y)
203+
if (abs_tol == 0) {
204+
x_exclude <- vec_duplicate_detect(xy)
205+
x_exclude <- vec_slice(x_exclude, seq_len(nrow(x)))
206+
} else {
207+
xy_dup_ids <- vec_duplicate_id(xy[ukey_names])
208+
xy_dup_inds2 <- which(xy_dup_ids != seq_along(xy_dup_ids))
209+
xy_dup_inds1 <- xy_dup_ids[xy_dup_inds2]
210+
x_exclude <- rep(FALSE, nrow(x))
211+
xy_vals <- xy[val_names]
212+
x_exclude[xy_dup_inds1] <- vec_approx_equal(xy_vals, inds1 = xy_dup_inds2, xy_vals, inds2 = xy_dup_inds1, na_equal = TRUE, abs_tol = abs_tol)
213+
}
214+
vec_slice(x, !x_exclude)
215+
}
216+
186217
#' Calculate compact patch to move from one snapshot/update to another
187218
#'
188219
#' @param earlier_snapshot tibble or `NULL`; `NULL` represents that there was no
@@ -214,117 +245,41 @@ tbl_diff2 <- function(earlier_snapshot, later_tbl,
214245
# use faster validation variants:
215246
if (!is_tibble(later_tbl)) {
216247
cli_abort("`later_tbl` must be a tibble",
217-
class = "epiprocess__tbl_diff2__later_tbl_invalid"
218-
)
248+
class = "epiprocess__tbl_diff2__later_tbl_invalid"
249+
)
219250
}
220251
if (is.null(earlier_snapshot)) {
221252
return(later_tbl)
222253
}
223254
if (!is_tibble(earlier_snapshot)) {
224255
cli_abort("`earlier_snapshot` must be a tibble or `NULL`",
225-
class = "epiprocess__tbl_diff2__earlier_tbl_class_invalid"
226-
)
256+
class = "epiprocess__tbl_diff2__earlier_tbl_class_invalid"
257+
)
227258
}
228259
if (!is.character(ukey_names) || !all(ukey_names %in% names(earlier_snapshot))) {
229260
cli_abort("`ukey_names` must be a subset of column names",
230-
class = "epiprocess__tbl_diff2__ukey_names_class_invalid"
231-
)
261+
class = "epiprocess__tbl_diff2__ukey_names_class_invalid"
262+
)
232263
}
233264
later_format <- arg_match0(later_format, c("snapshot", "update"))
234265
if (!(is.vector(compactify_abs_tol, mode = "numeric") &&
235-
length(compactify_abs_tol) == 1L && # nolint:indentation_linter
236-
compactify_abs_tol >= 0)) {
266+
length(compactify_abs_tol) == 1L && # nolint:indentation_linter
267+
compactify_abs_tol >= 0)) {
237268
# Give a specific message:
238269
assert_numeric(compactify_abs_tol, lower = 0, any.missing = FALSE, len = 1L)
239270
# Fallback e.g. for invalid classes not caught by assert_numeric:
240271
cli_abort("`compactify_abs_tol` must be a length-1 double/integer >= 0",
241-
class = "epiprocess__tbl_diff2__compactify_abs_tol_invalid"
242-
)
272+
class = "epiprocess__tbl_diff2__compactify_abs_tol_invalid")
243273
}
244274

245-
# Extract metadata:
246-
earlier_n <- nrow(earlier_snapshot)
247-
later_n <- nrow(later_tbl)
248-
tbl_names <- names(earlier_snapshot)
249-
val_names <- tbl_names[!tbl_names %in% ukey_names]
250-
251-
# More input validation:
252-
if (!identical(tbl_names, names(later_tbl))) {
253-
cli_abort(c(
254-
"`earlier_snapshot` and `later_tbl` should have identical column
255-
names and ordering.",
256-
"*" = "`earlier_snapshot` colnames: {format_chr_deparse(tbl_names)}",
257-
"*" = "`later_tbl` colnames: {format_chr_deparse(names(later_tbl))}"
258-
), class = "epiprocess__tbl_diff2__tbl_names_differ")
275+
all_names <- names(later_tbl)
276+
val_names <- all_names[! (all_names %in% ukey_names)]
277+
updates <- tbl_fast_anti_join(later_tbl, earlier_snapshot, ukey_names, val_names, compactify_abs_tol)
278+
if (later_format == "snapshot") {
279+
deletions <- tbl_fast_anti_join(earlier_snapshot[ukey_names], later_tbl[ukey_names], ukey_names, character(), 0)
280+
updates <- vec_rbind(updates, deletions) # fills vals with NAs
259281
}
260-
261-
combined_tbl <- vec_rbind(earlier_snapshot, later_tbl)
262-
combined_n <- nrow(combined_tbl)
263-
264-
# We'll also need epikeytimes and value columns separately:
265-
combined_ukeys <- combined_tbl[ukey_names]
266-
combined_vals <- combined_tbl[val_names]
267-
268-
# We have five types of rows in combined_tbl:
269-
# 1. From earlier_snapshot, no matching ukey in later_tbl (deletion; turn vals to
270-
# NAs to match epi_archive format)
271-
# 2. From earlier_snapshot, with matching ukey in later_tbl (context; exclude from
272-
# result)
273-
# 3. From later_tbl, with matching ukey in earlier_snapshot, with value "close" (change
274-
# that we'll compactify away)
275-
# 4. From later_tbl, with matching ukey in earlier_snapshot, value not "close" (change
276-
# that we'll record)
277-
# 5. From later_tbl, with no matching ukey in later_tbl (addition)
278-
279-
# For "snapshot" later_format, we need to filter to 1., 4., and 5., and alter
280-
# values for 1. For "update" later_format, we need to filter to 4. and 5.
281-
282-
# (For compactify_abs_tol = 0, we could potentially streamline things by dropping
283-
# ukey+val duplicates (cases 2. and 3.).)
284-
285-
# Row indices of first occurrence of each ukey; will be the same as
286-
# seq_len(combined_n) for each ukey's first appearance (cases 1., 2., or 5.);
287-
# for re-reported ukeys in `later_tbl` (cases 3. or 4.), it will point back to
288-
# the row index of the same ukey in `earlier_snapshot`:
289-
combined_ukey_firsts <- vec_duplicate_id(combined_ukeys)
290-
291-
# Which rows from combined are cases 3. or 4.?
292-
combined_ukey_is_repeat <- combined_ukey_firsts != seq_len(combined_n)
293-
# For each row in 3. or 4., row numbers of the ukey appearance in earlier:
294-
ukey_repeat_first_i <- combined_ukey_firsts[combined_ukey_is_repeat]
295-
296-
# Which rows from combined are in case 3.?
297-
combined_compactify_away <- rep(FALSE, combined_n)
298-
combined_compactify_away[combined_ukey_is_repeat] <-
299-
vec_approx_equal0(combined_vals,
300-
combined_vals,
301-
na_equal = TRUE,
302-
abs_tol = compactify_abs_tol,
303-
inds1 = combined_ukey_is_repeat,
304-
inds2 = ukey_repeat_first_i
305-
)
306-
307-
# Which rows from combined are in cases 3., 4., or 5.?
308-
combined_from_later <- vec_rep_each(c(FALSE, TRUE), c(earlier_n, later_n))
309-
310-
if (later_format == "update") {
311-
# Cases 4. and 5.:
312-
combined_tbl <- combined_tbl[combined_from_later & !combined_compactify_away, ]
313-
} else { # later_format is "snapshot"
314-
# Which rows from combined are in case 1.?
315-
combined_is_deletion <- vec_rep_each(c(TRUE, FALSE), c(earlier_n, later_n))
316-
combined_is_deletion[ukey_repeat_first_i] <- FALSE
317-
# Which rows from combined are in cases 1., 4., or 5.?
318-
combined_include <- combined_is_deletion | combined_from_later & !combined_compactify_away
319-
combined_tbl <- combined_tbl[combined_include, ]
320-
# Represent deletion in 1. with NA-ing of all value columns. (In some
321-
# previous approaches to epi_diff2, this seemed to be faster than using
322-
# vec_rbind(case_1_ukeys, cases_45_tbl) or bind_rows to fill with NAs, and more
323-
# general than data.table's rbind(case_1_ukeys, cases_45_tbl, fill = TRUE).)
324-
combined_tbl[combined_is_deletion[combined_include], val_names] <- NA
325-
}
326-
327-
combined_tbl
282+
updates
328283
}
329284

330285
#' Apply an update (e.g., from `tbl_diff2`) to a snapshot

0 commit comments

Comments
 (0)