Skip to content

Commit 1c5f3c9

Browse files
committed
fix(tbl_fast_anti_join): include non-ukey, non-val cols in result
1 parent 71116cb commit 1c5f3c9

File tree

1 file changed

+39
-21
lines changed

1 file changed

+39
-21
lines changed

R/patch.R

+39-21
Original file line numberDiff line numberDiff line change
@@ -196,7 +196,10 @@ vec_approx_equal0 <- function(vec1, vec2, na_equal, abs_tol, inds1 = NULL, inds2
196196
#' @return rows from `x` that either (a) don't have a (0-tolerance) matching
197197
#' ukey in `y`, or (b) have a matching ukey in `y`, but don't have
198198
#' approximately equal value column values
199+
#'
200+
#' @keywords internal
199201
tbl_fast_anti_join <- function(x, y, ukey_names, val_names, abs_tol = 0) {
202+
x_orig <- x
200203
x <- x[c(ukey_names, val_names)]
201204
y <- y[c(ukey_names, val_names)]
202205
xy <- vec_rbind(x, y)
@@ -211,7 +214,7 @@ tbl_fast_anti_join <- function(x, y, ukey_names, val_names, abs_tol = 0) {
211214
xy_vals <- xy[val_names]
212215
x_exclude[xy_dup_inds1] <- vec_approx_equal(xy_vals, inds1 = xy_dup_inds2, xy_vals, inds2 = xy_dup_inds1, na_equal = TRUE, abs_tol = abs_tol)
213216
}
214-
vec_slice(x, !x_exclude)
217+
vec_slice(x_orig, !x_exclude)
215218
}
216219

217220
#' Calculate compact patch to move from one snapshot/update to another
@@ -244,41 +247,53 @@ tbl_diff2 <- function(earlier_snapshot, later_tbl,
244247
# Most input validation + handle NULL earlier_snapshot. This is a small function so
245248
# use faster validation variants:
246249
if (!is_tibble(later_tbl)) {
247-
cli_abort("`later_tbl` must be a tibble",
248-
class = "epiprocess__tbl_diff2__later_tbl_invalid"
249-
)
250+
cli_abort(
251+
"`later_tbl` must be a tibble",
252+
class = "epiprocess__tbl_diff2__later_tbl_invalid"
253+
)
250254
}
251255
if (is.null(earlier_snapshot)) {
252256
return(later_tbl)
253257
}
254258
if (!is_tibble(earlier_snapshot)) {
255-
cli_abort("`earlier_snapshot` must be a tibble or `NULL`",
256-
class = "epiprocess__tbl_diff2__earlier_tbl_class_invalid"
257-
)
259+
cli_abort(
260+
"`earlier_snapshot` must be a tibble or `NULL`",
261+
class = "epiprocess__tbl_diff2__earlier_tbl_class_invalid"
262+
)
258263
}
259264
if (!is.character(ukey_names) || !all(ukey_names %in% names(earlier_snapshot))) {
260-
cli_abort("`ukey_names` must be a subset of column names",
261-
class = "epiprocess__tbl_diff2__ukey_names_class_invalid"
262-
)
265+
cli_abort(
266+
"`ukey_names` must be a subset of column names",
267+
class = "epiprocess__tbl_diff2__ukey_names_class_invalid"
268+
)
263269
}
264270
later_format <- arg_match0(later_format, c("snapshot", "update"))
265271
if (!(is.vector(compactify_abs_tol, mode = "numeric") &&
266-
length(compactify_abs_tol) == 1L && # nolint:indentation_linter
267-
compactify_abs_tol >= 0)) {
272+
length(compactify_abs_tol) == 1L && # nolint:indentation_linter
273+
compactify_abs_tol >= 0)) {
268274
# Give a specific message:
269275
assert_numeric(compactify_abs_tol, lower = 0, any.missing = FALSE, len = 1L)
270276
# Fallback e.g. for invalid classes not caught by assert_numeric:
271-
cli_abort("`compactify_abs_tol` must be a length-1 double/integer >= 0",
272-
class = "epiprocess__tbl_diff2__compactify_abs_tol_invalid")
277+
cli_abort(
278+
"`compactify_abs_tol` must be a length-1 double/integer >= 0",
279+
class = "epiprocess__tbl_diff2__compactify_abs_tol_invalid"
280+
)
273281
}
274282

275283
all_names <- names(later_tbl)
276-
val_names <- all_names[! (all_names %in% ukey_names)]
284+
val_names <- all_names[!(all_names %in% ukey_names)]
277285
updates <- tbl_fast_anti_join(later_tbl, earlier_snapshot, ukey_names, val_names, compactify_abs_tol)
278286
if (later_format == "snapshot") {
287+
# Interpret `later_tbl` as a full snapshot, rather than a diff / sparse
288+
# update. That means that any ukeys in `earlier_snapshot` that don't appear
289+
# in `later_tbl` were deleted in the later snapshot.
279290
deletions <- tbl_fast_anti_join(earlier_snapshot[ukey_names], later_tbl[ukey_names], ukey_names, character(), 0)
280-
updates <- vec_rbind(updates, deletions) # fills vals with NAs
291+
updates <- vec_rbind(updates, deletions) # fills val cols with NAs
281292
}
293+
# If `later_format == "update"`, we don't need to do anything special about
294+
# the above ukeys. The full snapshot for the later version would include the
295+
# corresponding rows unchanged, and the diff for these unchanged rows would be
296+
# empty.
282297
updates
283298
}
284299

@@ -299,20 +314,23 @@ tbl_patch <- function(snapshot, update, ukey_names) {
299314
# Most input validation. This is a small function so use faster validation
300315
# variants:
301316
if (!is_tibble(update)) {
302-
cli_abort("`update` must be a tibble",
317+
cli_abort(
318+
"`update` must be a tibble",
303319
class = "epiprocess__tbl_patch__update_class_invalid"
304320
)
305321
}
306322
if (is.null(snapshot)) {
307323
return(update)
308324
}
309325
if (!is_tibble(snapshot)) {
310-
cli_abort("`snapshot` must be a tibble",
326+
cli_abort(
327+
"`snapshot` must be a tibble",
311328
class = "epiprocess__tbl_patch__snapshot_class_invalid"
312329
)
313330
}
314331
if (!is.character(ukey_names) || !all(ukey_names %in% names(snapshot))) {
315-
cli_abort("`ukey_names` must be a subset of column names",
332+
cli_abort(
333+
"`ukey_names` must be a subset of column names",
316334
class = "epiprocess__tbl_patch__ukey_names_invalid"
317335
)
318336
}
@@ -333,8 +351,8 @@ tbl_patch <- function(snapshot, update, ukey_names) {
333351
# This is like `!duplicated()` but faster, and like `vec_unique_loc()` but guaranteeing
334352
# that we get the first appearance since `vec_duplicate_id()` guarantees that
335353
# it points to the first appearance.
336-
not_overwritten <- dup_ids == vec_seq_along(result_tbl)
337-
result_tbl <- result_tbl[not_overwritten, ]
354+
is_only_or_favored_appearance <- dup_ids == vec_seq_along(result_tbl)
355+
result_tbl <- result_tbl[is_only_or_favored_appearance, ]
338356

339357
result_tbl
340358
}

0 commit comments

Comments
 (0)