Skip to content

Commit

Permalink
Avoid triggering duckplyr's ALTREP row names (#6947)
Browse files Browse the repository at this point in the history
* Give `import_from()` an `optional` argument

* Implement `dplyr_attributes()` and `dplyr_set_attributes()`

* Switch to pairlist based approach

* Add note about ALTREP wrapper minimum size

* NEWS bullet
  • Loading branch information
DavisVaughan authored Nov 6, 2023
1 parent faacd32 commit 3d6c047
Show file tree
Hide file tree
Showing 8 changed files with 230 additions and 9 deletions.
3 changes: 3 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
# dplyr (development version)

* `dplyr_reconstruct()`'s default method has been rewritten to avoid
materializing duckplyr queries too early (#6947).

* Updated the `storms` data to include 2022 data (#6937, @steveharoz).

* Updated the `starwars` data to use a new API, because the old one is defunct.
Expand Down
8 changes: 2 additions & 6 deletions R/generics.R
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,7 @@ dplyr_col_modify.rowwise_df <- function(data, cols) {
dplyr_reconstruct <- function(data, template) {
# Strip attributes before dispatch to make it easier to implement
# methods and prevent unexpected leaking of irrelevant attributes.
# This also enforces that `data` is a well-formed data frame.
data <- dplyr_new_data_frame(data)
return(dplyr_reconstruct_dispatch(data, template))
UseMethod("dplyr_reconstruct", template)
Expand All @@ -202,12 +203,7 @@ dplyr_reconstruct_dispatch <- function(data, template) {

#' @export
dplyr_reconstruct.data.frame <- function(data, template) {
attrs <- attributes(template)
attrs$names <- names(data)
attrs$row.names <- .row_names_info(data, type = 0L)

attributes(data) <- attrs
data
.Call(ffi_dplyr_reconstruct, data, template)
}

#' @export
Expand Down
9 changes: 6 additions & 3 deletions R/zzz.R
Original file line number Diff line number Diff line change
Expand Up @@ -40,13 +40,16 @@
setHook(packageEvent("plyr", "attach"), NULL, "replace")
}

import_vctrs <- function(name) {
import_from(name, "vctrs")
import_vctrs <- function(name, optional = FALSE) {
import_from(name, "vctrs", optional = optional)
}
import_from <- function(name, package) {
import_from <- function(name, package, optional = FALSE) {
ns <- getNamespace(package)

if (!exists(name, mode = "function", envir = ns, inherits = FALSE)) {
if (optional) {
return(NULL)
}
abort(sprintf("No such '%s' function: `%s()`.", package, name))
}

Expand Down
4 changes: 4 additions & 0 deletions src/dplyr.h
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,10 @@ inline bool obj_is_list(SEXP x) {

}

SEXP ffi_dplyr_reconstruct(SEXP data, SEXP template_);
SEXP ffi_test_dplyr_attributes(SEXP x);
SEXP ffi_test_dplyr_set_attributes(SEXP x, SEXP attributes);

SEXP dplyr_expand_groups(SEXP old_groups, SEXP positions, SEXP s_nr);
SEXP dplyr_cumall(SEXP x);
SEXP dplyr_cumany(SEXP x);
Expand Down
4 changes: 4 additions & 0 deletions src/init.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,10 @@ SEXP dplyr_init_library(SEXP ns_dplyr, SEXP ns_vctrs, SEXP ns_rlang) {
static const R_CallMethodDef CallEntries[] = {
{"dplyr_init_library", (DL_FUNC)& dplyr_init_library, 3},

{"ffi_dplyr_reconstruct", (DL_FUNC)& ffi_dplyr_reconstruct, 2},
{"ffi_test_dplyr_attributes", (DL_FUNC)& ffi_test_dplyr_attributes, 1},
{"ffi_test_dplyr_set_attributes", (DL_FUNC)& ffi_test_dplyr_set_attributes, 2},

{"dplyr_expand_groups", (DL_FUNC)& dplyr_expand_groups, 3},
{"dplyr_cumall", (DL_FUNC)& dplyr_cumall, 1},
{"dplyr_cumany", (DL_FUNC)& dplyr_cumany, 1},
Expand Down
138 changes: 138 additions & 0 deletions src/reconstruct.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
#include "dplyr.h"

// Essentially, a C implementation of:
//
// ```
// attributes <- attributes(template)
// attributes$names <- names(data)
// attributes$row.names <- .row_names_info(data, type = 0L)
// attributes(data) <- attributes
// ```
//
// The problem with that is that:
// - `attributes()` ends up calling `Rf_getAttrib()`, which tries to check
// for internal `row.names` in `template` so they aren't leaked to the user.
// Unfortunately this materializes lazy ALTREP `row.names`, like those used
// by duckplyr.
// - `attributes<-()` ends up calling `Rf_setAttrib()`, which tries to check
// if it can make efficient internal `row.names`. Again, this materializes
// lazy ALTREP `row.names`, like those used by duckplyr.
//
// So we bypass that here by carefully manipulating the attribute pairlists.
//
// We expect that at this point, both `data` and `template_` are S3 data
// frames, both of which have `names` and `row.names` attributes. If this isn't
// true, we error.
// - For `data`, we enforce this in `dplyr_reconstruct()`'s generic by calling
// `dplyr_new_data_frame()` (ideally no intermediate method invalidates this).
// - For `template_`, we assume this since we got here through the S3 method
// `dplyr_reconstruct.data.frame()`, which dispatched off `template_`. A
// well-formed S3 data frame must have `names` and `row.names` attributes.
//
// https://github.com/tidyverse/dplyr/pull/6947
// https://github.com/tidyverse/dplyr/issues/6525#issuecomment-1303619152
// https://github.com/wch/r-source/blob/69b94f0c8ce9b2497f6d7a81922575f6c585b713/src/main/attrib.c#L176-L177
// https://github.com/wch/r-source/blob/69b94f0c8ce9b2497f6d7a81922575f6c585b713/src/main/attrib.c#L57
SEXP ffi_dplyr_reconstruct(SEXP data, SEXP template_) {
if (TYPEOF(data) != VECSXP) {
Rf_errorcall(R_NilValue, "Internal error: `data` must be a list.");
}
if (TYPEOF(template_) != VECSXP) {
Rf_errorcall(R_NilValue, "Internal error: `template` must be a list.");
}
if (!OBJECT(data)) {
Rf_errorcall(R_NilValue, "Internal error: `data` must be an object.");
}
if (!OBJECT(template_)) {
Rf_errorcall(R_NilValue, "Internal error: `template` must be an object.");
}

bool seen_names = false;
bool seen_row_names = false;

// Pull the `names` and `row.names` off `data`.
// These are the only 2 attributes from `data` that persist.
SEXP names = R_NilValue;
SEXP row_names = R_NilValue;

for (SEXP node = ATTRIB(data); node != R_NilValue; node = CDR(node)) {
SEXP tag = TAG(node);

if (tag == R_NamesSymbol) {
names = CAR(node);
MARK_NOT_MUTABLE(names);
seen_names = true;
}
if (tag == R_RowNamesSymbol) {
row_names = CAR(node);
MARK_NOT_MUTABLE(row_names);
seen_row_names = true;
}
}

if (!seen_names) {
Rf_errorcall(R_NilValue, "Internal error: `data` must have a `names` attribute.");
}
if (!seen_row_names) {
Rf_errorcall(R_NilValue, "Internal error: `data` must have a `row.names` attribute.");
}

seen_names = false;
seen_row_names = false;

// Now replace the `names` and `row.names` attributes in the `template_`
// attributes with the ones from `data`. This attribute set becomes the final
// one we set on `data`.
SEXP attributes = ATTRIB(template_);
attributes = PROTECT(Rf_shallow_duplicate(attributes));

for (SEXP node = attributes; node != R_NilValue; node = CDR(node)) {
SEXP tag = TAG(node);

if (tag == R_NamesSymbol) {
SETCAR(node, names);
seen_names = true;
}
if (tag == R_RowNamesSymbol) {
SETCAR(node, row_names);
seen_row_names = true;
}
}

if (!seen_names) {
Rf_errorcall(R_NilValue, "Internal error: `template` must have a `names` attribute.");
}
if (!seen_row_names) {
Rf_errorcall(R_NilValue, "Internal error: `template` must have a `row.names` attribute.");
}

// Make an ALTREP wrapper if possible, since the underlying data doesn't change.
// Won't actually make an ALTREP wrapper unless there are >64 columns
// (internally controlled by R).
#if R_VERSION >= R_Version(3, 6, 0)
data = PROTECT(R_shallow_duplicate_attr(data));
#else
data = PROTECT(Rf_shallow_duplicate(data));
#endif

SET_ATTRIB(data, attributes);

UNPROTECT(2);
return data;
}

// Very unsafe wrappers needed for testing.
// Bypass `Rf_getAttrib()` and `Rf_setAttrib()` calls to avoid forcing ALTREP
// `row.names`.
SEXP ffi_test_dplyr_attributes(SEXP x) {
return ATTRIB(x);
}
SEXP ffi_test_dplyr_set_attributes(SEXP x, SEXP attributes) {
if (TYPEOF(attributes) != LISTSXP) {
Rf_errorcall(R_NilValue, "`attributes` must be a pairlist.");
}
x = PROTECT(Rf_shallow_duplicate(x));
SET_ATTRIB(x, attributes);
UNPROTECT(1);
return x;
}
22 changes: 22 additions & 0 deletions tests/testthat/helper-lazy.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
skip_if_no_lazy_character <- function() {
skip_if(getRversion() <= "3.5.0")

new_lazy_character <- import_vctrs("new_lazy_character", optional = TRUE)
lazy_character_is_materialized <- import_vctrs("lazy_character_is_materialized", optional = TRUE)

if (is.null(new_lazy_character) || is.null(lazy_character_is_materialized)) {
skip("Lazy character helpers from vctrs are not available.")
}

invisible()
}

new_lazy_character <- function(fn) {
f <- import_vctrs("new_lazy_character")
f(fn)
}

lazy_character_is_materialized <- function(x) {
f <- import_vctrs("lazy_character_is_materialized")
f(x)
}
51 changes: 51 additions & 0 deletions tests/testthat/test-generics.R
Original file line number Diff line number Diff line change
Expand Up @@ -137,3 +137,54 @@ test_that("dplyr_reconstruct() strips attributes before dispatch", {
dplyr_reconstruct(df, df)
expect_identical(out, data.frame(x = 1, row.names = "a"))
})

test_that("`dplyr_reconstruct()` retains attribute ordering of `template`", {
df <- vctrs::data_frame(x = 1)
expect_identical(
attributes(dplyr_reconstruct(df, df)),
attributes(df)
)
})

test_that("`dplyr_reconstruct()` doesn't modify the original `data` in place", {
data <- new_data_frame(list(x = 1), foo = "bar")
template <- vctrs::data_frame(x = 1)

out <- dplyr_reconstruct(data, template)

expect_null(attr(out, "foo"))
expect_identical(attr(data, "foo"), "bar")
})

test_that("`dplyr_reconstruct()`, which gets and sets attributes, doesn't touch `row.names` (#6525)", {
skip_if_no_lazy_character()

dplyr_attributes <- function(x) {
.Call(ffi_test_dplyr_attributes, x)
}
dplyr_set_attributes <- function(x, attributes) {
.Call(ffi_test_dplyr_set_attributes, x, attributes)
}

df <- vctrs::data_frame(x = 1)

attributes <- attributes(df)
attributes$row.names <- new_lazy_character(function() "a")
attributes <- as.pairlist(attributes)

df_with_lazy_row_names <- dplyr_set_attributes(df, attributes)

# Ensure `data` row names aren't materialized
x <- dplyr_reconstruct(df_with_lazy_row_names, df)
attributes <- dplyr_attributes(df_with_lazy_row_names)
expect_false(lazy_character_is_materialized(attributes$row.names))

# `data` row names should also propagate into the result unmaterialized
attributes <- dplyr_attributes(x)
expect_false(lazy_character_is_materialized(attributes$row.names))

# Ensure `template` row names aren't materialized
x <- dplyr_reconstruct(df, df_with_lazy_row_names)
attributes <- dplyr_attributes(df_with_lazy_row_names)
expect_false(lazy_character_is_materialized(attributes$row.names))
})

0 comments on commit 3d6c047

Please sign in to comment.