@@ -183,6 +183,37 @@ vec_approx_equal0 <- function(vec1, vec2, na_equal, abs_tol, inds1 = NULL, inds2
183
183
}
184
184
}
185
185
186
+ # ' Variation on [`dplyr::anti_join`] for speed + tolerance setting
187
+ # '
188
+ # ' @param x tibble; `x[ukey_names]` must not have any duplicate rows
189
+ # ' @param y tibble; `y[ukey_names]` must not have any duplicate rows
190
+ # ' @param ukey_names chr; names of columns that form a unique key, for `x` and
191
+ # ' for `y`
192
+ # ' @param val_names chr; names of columns which should be treated as
193
+ # ' value/measurement columns, and compared with a tolerance
194
+ # ' @param abs_tol scalar non-negative numeric; absolute tolerance with which to
195
+ # ' compare value columns; see [`vec_approx_equal`]
196
+ # ' @return rows from `x` that either (a) don't have a (0-tolerance) matching
197
+ # ' ukey in `y`, or (b) have a matching ukey in `y`, but don't have
198
+ # ' approximately equal value column values
199
+ tbl_fast_anti_join <- function (x , y , ukey_names , val_names , abs_tol = 0 ) {
200
+ x <- x [c(ukey_names , val_names )]
201
+ y <- y [c(ukey_names , val_names )]
202
+ xy <- vec_rbind(x , y )
203
+ if (abs_tol == 0 ) {
204
+ x_exclude <- vec_duplicate_detect(xy )
205
+ x_exclude <- vec_slice(x_exclude , seq_len(nrow(x )))
206
+ } else {
207
+ xy_dup_ids <- vec_duplicate_id(xy [ukey_names ])
208
+ xy_dup_inds2 <- which(xy_dup_ids != seq_along(xy_dup_ids ))
209
+ xy_dup_inds1 <- xy_dup_ids [xy_dup_inds2 ]
210
+ x_exclude <- rep(FALSE , nrow(x ))
211
+ xy_vals <- xy [val_names ]
212
+ x_exclude [xy_dup_inds1 ] <- vec_approx_equal(xy_vals , inds1 = xy_dup_inds2 , xy_vals , inds2 = xy_dup_inds1 , na_equal = TRUE , abs_tol = abs_tol )
213
+ }
214
+ vec_slice(x , ! x_exclude )
215
+ }
216
+
186
217
# ' Calculate compact patch to move from one snapshot/update to another
187
218
# '
188
219
# ' @param earlier_snapshot tibble or `NULL`; `NULL` represents that there was no
@@ -214,117 +245,41 @@ tbl_diff2 <- function(earlier_snapshot, later_tbl,
214
245
# use faster validation variants:
215
246
if (! is_tibble(later_tbl )) {
216
247
cli_abort(" `later_tbl` must be a tibble" ,
217
- class = " epiprocess__tbl_diff2__later_tbl_invalid"
218
- )
248
+ class = " epiprocess__tbl_diff2__later_tbl_invalid"
249
+ )
219
250
}
220
251
if (is.null(earlier_snapshot )) {
221
252
return (later_tbl )
222
253
}
223
254
if (! is_tibble(earlier_snapshot )) {
224
255
cli_abort(" `earlier_snapshot` must be a tibble or `NULL`" ,
225
- class = " epiprocess__tbl_diff2__earlier_tbl_class_invalid"
226
- )
256
+ class = " epiprocess__tbl_diff2__earlier_tbl_class_invalid"
257
+ )
227
258
}
228
259
if (! is.character(ukey_names ) || ! all(ukey_names %in% names(earlier_snapshot ))) {
229
260
cli_abort(" `ukey_names` must be a subset of column names" ,
230
- class = " epiprocess__tbl_diff2__ukey_names_class_invalid"
231
- )
261
+ class = " epiprocess__tbl_diff2__ukey_names_class_invalid"
262
+ )
232
263
}
233
264
later_format <- arg_match0(later_format , c(" snapshot" , " update" ))
234
265
if (! (is.vector(compactify_abs_tol , mode = " numeric" ) &&
235
- length(compactify_abs_tol ) == 1L && # nolint:indentation_linter
236
- compactify_abs_tol > = 0 )) {
266
+ length(compactify_abs_tol ) == 1L && # nolint:indentation_linter
267
+ compactify_abs_tol > = 0 )) {
237
268
# Give a specific message:
238
269
assert_numeric(compactify_abs_tol , lower = 0 , any.missing = FALSE , len = 1L )
239
270
# Fallback e.g. for invalid classes not caught by assert_numeric:
240
271
cli_abort(" `compactify_abs_tol` must be a length-1 double/integer >= 0" ,
241
- class = " epiprocess__tbl_diff2__compactify_abs_tol_invalid"
242
- )
272
+ class = " epiprocess__tbl_diff2__compactify_abs_tol_invalid" )
243
273
}
244
274
245
- # Extract metadata:
246
- earlier_n <- nrow(earlier_snapshot )
247
- later_n <- nrow(later_tbl )
248
- tbl_names <- names(earlier_snapshot )
249
- val_names <- tbl_names [! tbl_names %in% ukey_names ]
250
-
251
- # More input validation:
252
- if (! identical(tbl_names , names(later_tbl ))) {
253
- cli_abort(c(
254
- " `earlier_snapshot` and `later_tbl` should have identical column
255
- names and ordering." ,
256
- " *" = " `earlier_snapshot` colnames: {format_chr_deparse(tbl_names)}" ,
257
- " *" = " `later_tbl` colnames: {format_chr_deparse(names(later_tbl))}"
258
- ), class = " epiprocess__tbl_diff2__tbl_names_differ" )
275
+ all_names <- names(later_tbl )
276
+ val_names <- all_names [! (all_names %in% ukey_names )]
277
+ updates <- tbl_fast_anti_join(later_tbl , earlier_snapshot , ukey_names , val_names , compactify_abs_tol )
278
+ if (later_format == " snapshot" ) {
279
+ deletions <- tbl_fast_anti_join(earlier_snapshot [ukey_names ], later_tbl [ukey_names ], ukey_names , character (), 0 )
280
+ updates <- vec_rbind(updates , deletions ) # fills vals with NAs
259
281
}
260
-
261
- combined_tbl <- vec_rbind(earlier_snapshot , later_tbl )
262
- combined_n <- nrow(combined_tbl )
263
-
264
- # We'll also need epikeytimes and value columns separately:
265
- combined_ukeys <- combined_tbl [ukey_names ]
266
- combined_vals <- combined_tbl [val_names ]
267
-
268
- # We have five types of rows in combined_tbl:
269
- # 1. From earlier_snapshot, no matching ukey in later_tbl (deletion; turn vals to
270
- # NAs to match epi_archive format)
271
- # 2. From earlier_snapshot, with matching ukey in later_tbl (context; exclude from
272
- # result)
273
- # 3. From later_tbl, with matching ukey in earlier_snapshot, with value "close" (change
274
- # that we'll compactify away)
275
- # 4. From later_tbl, with matching ukey in earlier_snapshot, value not "close" (change
276
- # that we'll record)
277
- # 5. From later_tbl, with no matching ukey in later_tbl (addition)
278
-
279
- # For "snapshot" later_format, we need to filter to 1., 4., and 5., and alter
280
- # values for 1. For "update" later_format, we need to filter to 4. and 5.
281
-
282
- # (For compactify_abs_tol = 0, we could potentially streamline things by dropping
283
- # ukey+val duplicates (cases 2. and 3.).)
284
-
285
- # Row indices of first occurrence of each ukey; will be the same as
286
- # seq_len(combined_n) for each ukey's first appearance (cases 1., 2., or 5.);
287
- # for re-reported ukeys in `later_tbl` (cases 3. or 4.), it will point back to
288
- # the row index of the same ukey in `earlier_snapshot`:
289
- combined_ukey_firsts <- vec_duplicate_id(combined_ukeys )
290
-
291
- # Which rows from combined are cases 3. or 4.?
292
- combined_ukey_is_repeat <- combined_ukey_firsts != seq_len(combined_n )
293
- # For each row in 3. or 4., row numbers of the ukey appearance in earlier:
294
- ukey_repeat_first_i <- combined_ukey_firsts [combined_ukey_is_repeat ]
295
-
296
- # Which rows from combined are in case 3.?
297
- combined_compactify_away <- rep(FALSE , combined_n )
298
- combined_compactify_away [combined_ukey_is_repeat ] <-
299
- vec_approx_equal0(combined_vals ,
300
- combined_vals ,
301
- na_equal = TRUE ,
302
- abs_tol = compactify_abs_tol ,
303
- inds1 = combined_ukey_is_repeat ,
304
- inds2 = ukey_repeat_first_i
305
- )
306
-
307
- # Which rows from combined are in cases 3., 4., or 5.?
308
- combined_from_later <- vec_rep_each(c(FALSE , TRUE ), c(earlier_n , later_n ))
309
-
310
- if (later_format == " update" ) {
311
- # Cases 4. and 5.:
312
- combined_tbl <- combined_tbl [combined_from_later & ! combined_compactify_away , ]
313
- } else { # later_format is "snapshot"
314
- # Which rows from combined are in case 1.?
315
- combined_is_deletion <- vec_rep_each(c(TRUE , FALSE ), c(earlier_n , later_n ))
316
- combined_is_deletion [ukey_repeat_first_i ] <- FALSE
317
- # Which rows from combined are in cases 1., 4., or 5.?
318
- combined_include <- combined_is_deletion | combined_from_later & ! combined_compactify_away
319
- combined_tbl <- combined_tbl [combined_include , ]
320
- # Represent deletion in 1. with NA-ing of all value columns. (In some
321
- # previous approaches to epi_diff2, this seemed to be faster than using
322
- # vec_rbind(case_1_ukeys, cases_45_tbl) or bind_rows to fill with NAs, and more
323
- # general than data.table's rbind(case_1_ukeys, cases_45_tbl, fill = TRUE).)
324
- combined_tbl [combined_is_deletion [combined_include ], val_names ] <- NA
325
- }
326
-
327
- combined_tbl
282
+ updates
328
283
}
329
284
330
285
# ' Apply an update (e.g., from `tbl_diff2`) to a snapshot
0 commit comments