Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Avoid triggering duckplyr's ALTREP row names #6947

Merged
merged 5 commits into from
Nov 6, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
# dplyr (development version)

* `dplyr_reconstruct()`'s default method has been rewritten to avoid
materializing duckplyr queries too early (#6947).

* Updated the `storms` data to include 2022 data (#6937, @steveharoz).

* Updated the `starwars` data to use a new API, because the old one is defunct.
Expand Down
8 changes: 2 additions & 6 deletions R/generics.R
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,7 @@ dplyr_col_modify.rowwise_df <- function(data, cols) {
dplyr_reconstruct <- function(data, template) {
# Strip attributes before dispatch to make it easier to implement
# methods and prevent unexpected leaking of irrelevant attributes.
# This also enforces that `data` is a well-formed data frame.
data <- dplyr_new_data_frame(data)
return(dplyr_reconstruct_dispatch(data, template))
UseMethod("dplyr_reconstruct", template)
Expand All @@ -202,12 +203,7 @@ dplyr_reconstruct_dispatch <- function(data, template) {

#' @export
dplyr_reconstruct.data.frame <- function(data, template) {
attrs <- attributes(template)
attrs$names <- names(data)
attrs$row.names <- .row_names_info(data, type = 0L)

attributes(data) <- attrs
data
.Call(ffi_dplyr_reconstruct, data, template)
}

#' @export
Expand Down
9 changes: 6 additions & 3 deletions R/zzz.R
Original file line number Diff line number Diff line change
Expand Up @@ -40,13 +40,16 @@
setHook(packageEvent("plyr", "attach"), NULL, "replace")
}

import_vctrs <- function(name) {
import_from(name, "vctrs")
import_vctrs <- function(name, optional = FALSE) {
import_from(name, "vctrs", optional = optional)
}
import_from <- function(name, package) {
import_from <- function(name, package, optional = FALSE) {
ns <- getNamespace(package)

if (!exists(name, mode = "function", envir = ns, inherits = FALSE)) {
if (optional) {
return(NULL)
}
abort(sprintf("No such '%s' function: `%s()`.", package, name))
}

Expand Down
4 changes: 4 additions & 0 deletions src/dplyr.h
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,10 @@ inline bool obj_is_list(SEXP x) {

}

SEXP ffi_dplyr_reconstruct(SEXP data, SEXP template_);
SEXP ffi_test_dplyr_attributes(SEXP x);
SEXP ffi_test_dplyr_set_attributes(SEXP x, SEXP attributes);

SEXP dplyr_expand_groups(SEXP old_groups, SEXP positions, SEXP s_nr);
SEXP dplyr_cumall(SEXP x);
SEXP dplyr_cumany(SEXP x);
Expand Down
4 changes: 4 additions & 0 deletions src/init.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,10 @@ SEXP dplyr_init_library(SEXP ns_dplyr, SEXP ns_vctrs, SEXP ns_rlang) {
static const R_CallMethodDef CallEntries[] = {
{"dplyr_init_library", (DL_FUNC)& dplyr_init_library, 3},

{"ffi_dplyr_reconstruct", (DL_FUNC)& ffi_dplyr_reconstruct, 2},
{"ffi_test_dplyr_attributes", (DL_FUNC)& ffi_test_dplyr_attributes, 1},
{"ffi_test_dplyr_set_attributes", (DL_FUNC)& ffi_test_dplyr_set_attributes, 2},

{"dplyr_expand_groups", (DL_FUNC)& dplyr_expand_groups, 3},
{"dplyr_cumall", (DL_FUNC)& dplyr_cumall, 1},
{"dplyr_cumany", (DL_FUNC)& dplyr_cumany, 1},
Expand Down
138 changes: 138 additions & 0 deletions src/reconstruct.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
#include "dplyr.h"

// Essentially, a C implementation of:
//
// ```
// attributes <- attributes(template)
// attributes$names <- names(data)
// attributes$row.names <- .row_names_info(data, type = 0L)
// attributes(data) <- attributes
// ```
//
// The problem with that is that:
// - `attributes()` ends up calling `Rf_getAttrib()`, which tries to check
// for internal `row.names` in `template` so they aren't leaked to the user.
// Unfortunately this materializes lazy ALTREP `row.names`, like those used
// by duckplyr.
// - `attributes<-()` ends up calling `Rf_setAttrib()`, which tries to check
// if it can make efficient internal `row.names`. Again, this materializes
// lazy ALTREP `row.names`, like those used by duckplyr.
//
// So we bypass that here by carefully manipulating the attribute pairlists.
//
// We expect that at this point, both `data` and `template_` are S3 data
// frames, both of which have `names` and `row.names` attributes. If this isn't
// true, we error.
// - For `data`, we enforce this in `dplyr_reconstruct()`'s generic by calling
// `dplyr_new_data_frame()` (ideally no intermediate method invalidates this).
// - For `template_`, we assume this since we got here through the S3 method
// `dplyr_reconstruct.data.frame()`, which dispatched off `template_`. A
// well-formed S3 data frame must have `names` and `row.names` attributes.
//
// https://github.com/tidyverse/dplyr/pull/6947
// https://github.com/tidyverse/dplyr/issues/6525#issuecomment-1303619152
// https://github.com/wch/r-source/blob/69b94f0c8ce9b2497f6d7a81922575f6c585b713/src/main/attrib.c#L176-L177
// https://github.com/wch/r-source/blob/69b94f0c8ce9b2497f6d7a81922575f6c585b713/src/main/attrib.c#L57
SEXP ffi_dplyr_reconstruct(SEXP data, SEXP template_) {
if (TYPEOF(data) != VECSXP) {
Rf_errorcall(R_NilValue, "Internal error: `data` must be a list.");
}
if (TYPEOF(template_) != VECSXP) {
Rf_errorcall(R_NilValue, "Internal error: `template` must be a list.");
}
if (!OBJECT(data)) {
Rf_errorcall(R_NilValue, "Internal error: `data` must be an object.");
}
if (!OBJECT(template_)) {
Rf_errorcall(R_NilValue, "Internal error: `template` must be an object.");
}

bool seen_names = false;
bool seen_row_names = false;

// Pull the `names` and `row.names` off `data`.
// These are the only 2 attributes from `data` that persist.
SEXP names = R_NilValue;
SEXP row_names = R_NilValue;

for (SEXP node = ATTRIB(data); node != R_NilValue; node = CDR(node)) {
SEXP tag = TAG(node);

if (tag == R_NamesSymbol) {
names = CAR(node);
MARK_NOT_MUTABLE(names);
seen_names = true;
}
if (tag == R_RowNamesSymbol) {
row_names = CAR(node);
MARK_NOT_MUTABLE(row_names);
seen_row_names = true;
}
}

if (!seen_names) {
Rf_errorcall(R_NilValue, "Internal error: `data` must have a `names` attribute.");
}
if (!seen_row_names) {
Rf_errorcall(R_NilValue, "Internal error: `data` must have a `row.names` attribute.");
}

seen_names = false;
seen_row_names = false;

// Now replace the `names` and `row.names` attributes in the `template_`
// attributes with the ones from `data`. This attribute set becomes the final
// one we set on `data`.
SEXP attributes = ATTRIB(template_);
attributes = PROTECT(Rf_shallow_duplicate(attributes));

for (SEXP node = attributes; node != R_NilValue; node = CDR(node)) {
SEXP tag = TAG(node);

if (tag == R_NamesSymbol) {
SETCAR(node, names);
seen_names = true;
}
if (tag == R_RowNamesSymbol) {
SETCAR(node, row_names);
seen_row_names = true;
}
}

if (!seen_names) {
Rf_errorcall(R_NilValue, "Internal error: `template` must have a `names` attribute.");
}
if (!seen_row_names) {
Rf_errorcall(R_NilValue, "Internal error: `template` must have a `row.names` attribute.");
}

// Make an ALTREP wrapper if possible, since the underlying data doesn't change.
// Won't actually make an ALTREP wrapper unless there are >64 columns
// (internally controlled by R).
#if R_VERSION >= R_Version(3, 6, 0)
data = PROTECT(R_shallow_duplicate_attr(data));
DavisVaughan marked this conversation as resolved.
Show resolved Hide resolved
#else
data = PROTECT(Rf_shallow_duplicate(data));
#endif

SET_ATTRIB(data, attributes);

UNPROTECT(2);
return data;
}

// Very unsafe wrappers needed for testing.
// Bypass `Rf_getAttrib()` and `Rf_setAttrib()` calls to avoid forcing ALTREP
// `row.names`.
SEXP ffi_test_dplyr_attributes(SEXP x) {
return ATTRIB(x);
}
SEXP ffi_test_dplyr_set_attributes(SEXP x, SEXP attributes) {
if (TYPEOF(attributes) != LISTSXP) {
Rf_errorcall(R_NilValue, "`attributes` must be a pairlist.");
}
x = PROTECT(Rf_shallow_duplicate(x));
SET_ATTRIB(x, attributes);
UNPROTECT(1);
return x;
}
22 changes: 22 additions & 0 deletions tests/testthat/helper-lazy.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
skip_if_no_lazy_character <- function() {
skip_if(getRversion() <= "3.5.0")

new_lazy_character <- import_vctrs("new_lazy_character", optional = TRUE)
lazy_character_is_materialized <- import_vctrs("lazy_character_is_materialized", optional = TRUE)

if (is.null(new_lazy_character) || is.null(lazy_character_is_materialized)) {
skip("Lazy character helpers from vctrs are not available.")
}

invisible()
}

new_lazy_character <- function(fn) {
f <- import_vctrs("new_lazy_character")
f(fn)
}

lazy_character_is_materialized <- function(x) {
f <- import_vctrs("lazy_character_is_materialized")
f(x)
}
51 changes: 51 additions & 0 deletions tests/testthat/test-generics.R
Original file line number Diff line number Diff line change
Expand Up @@ -137,3 +137,54 @@ test_that("dplyr_reconstruct() strips attributes before dispatch", {
dplyr_reconstruct(df, df)
expect_identical(out, data.frame(x = 1, row.names = "a"))
})

test_that("`dplyr_reconstruct()` retains attribute ordering of `template`", {
df <- vctrs::data_frame(x = 1)
expect_identical(
attributes(dplyr_reconstruct(df, df)),
attributes(df)
)
})

test_that("`dplyr_reconstruct()` doesn't modify the original `data` in place", {
data <- new_data_frame(list(x = 1), foo = "bar")
template <- vctrs::data_frame(x = 1)

out <- dplyr_reconstruct(data, template)

expect_null(attr(out, "foo"))
expect_identical(attr(data, "foo"), "bar")
})

test_that("`dplyr_reconstruct()`, which gets and sets attributes, doesn't touch `row.names` (#6525)", {
skip_if_no_lazy_character()

dplyr_attributes <- function(x) {
.Call(ffi_test_dplyr_attributes, x)
}
dplyr_set_attributes <- function(x, attributes) {
.Call(ffi_test_dplyr_set_attributes, x, attributes)
}

df <- vctrs::data_frame(x = 1)

attributes <- attributes(df)
attributes$row.names <- new_lazy_character(function() "a")
attributes <- as.pairlist(attributes)

df_with_lazy_row_names <- dplyr_set_attributes(df, attributes)

# Ensure `data` row names aren't materialized
x <- dplyr_reconstruct(df_with_lazy_row_names, df)
attributes <- dplyr_attributes(df_with_lazy_row_names)
expect_false(lazy_character_is_materialized(attributes$row.names))

# `data` row names should also propagate into the result unmaterialized
attributes <- dplyr_attributes(x)
expect_false(lazy_character_is_materialized(attributes$row.names))

# Ensure `template` row names aren't materialized
x <- dplyr_reconstruct(df, df_with_lazy_row_names)
attributes <- dplyr_attributes(df_with_lazy_row_names)
expect_false(lazy_character_is_materialized(attributes$row.names))
})
Loading