From ebc8bfe1bf0fd8a391b2ab6dd6da7d3733bb4966 Mon Sep 17 00:00:00 2001 From: Davis Vaughan Date: Thu, 2 Nov 2023 16:37:31 -0400 Subject: [PATCH] Switch to pairlist based approach --- R/generics.R | 6 +- R/utils.R | 7 -- src/attributes.cpp | 127 ------------------------------ src/dplyr.h | 5 +- src/init.cpp | 5 +- src/reconstruct.cpp | 136 +++++++++++++++++++++++++++++++++ tests/testthat/_snaps/utils.md | 32 -------- tests/testthat/test-generics.R | 51 +++++++++++++ tests/testthat/test-utils.R | 91 ---------------------- 9 files changed, 195 insertions(+), 265 deletions(-) delete mode 100644 src/attributes.cpp create mode 100644 src/reconstruct.cpp delete mode 100644 tests/testthat/_snaps/utils.md diff --git a/R/generics.R b/R/generics.R index 9e1d171161..a9155f1a44 100644 --- a/R/generics.R +++ b/R/generics.R @@ -192,6 +192,7 @@ dplyr_col_modify.rowwise_df <- function(data, cols) { dplyr_reconstruct <- function(data, template) { # Strip attributes before dispatch to make it easier to implement # methods and prevent unexpected leaking of irrelevant attributes. + # This also enforces that `data` is a well-formed data frame. data <- dplyr_new_data_frame(data) return(dplyr_reconstruct_dispatch(data, template)) UseMethod("dplyr_reconstruct", template) @@ -202,10 +203,7 @@ dplyr_reconstruct_dispatch <- function(data, template) { #' @export dplyr_reconstruct.data.frame <- function(data, template) { - attributes <- dplyr_attributes(template) - attributes$names <- names(data) - attributes$row.names <- .row_names_info(data, type = 0L) - dplyr_set_attributes(data, attributes) + .Call(ffi_dplyr_reconstruct, data, template) } #' @export diff --git a/R/utils.R b/R/utils.R index 29cb024f9f..e74ac8f32b 100644 --- a/R/utils.R +++ b/R/utils.R @@ -48,13 +48,6 @@ dplyr_new_tibble <- function(x, size) { new_data_frame(x = x, n = size, class = c("tbl_df", "tbl")) } -dplyr_attributes <- function(x) { - .Call(ffi_dplyr_attributes, x) -} -dplyr_set_attributes <- function(x, attributes) { - .Call(ffi_dplyr_set_attributes, x, attributes) -} - #' @param x A list #' @param fn An optional function of 1 argument to be applied to each list #' element of `x`. This allows you to further refine what elements should be diff --git a/src/attributes.cpp b/src/attributes.cpp deleted file mode 100644 index 0cb3f3b776..0000000000 --- a/src/attributes.cpp +++ /dev/null @@ -1,127 +0,0 @@ -#include "dplyr.h" - -// Push a new attribute at the end of `x`'s attributes. -// Caller is responsible for ensuring the `tag` doesn't already exist. -static inline -SEXP r_attrib_push(SEXP x, SEXP tag, SEXP value) { - SEXP attributes = Rf_cons(value, ATTRIB(x)); - SET_TAG(attributes, tag); - SET_ATTRIB(x, attributes); - return attributes; -} - -// Alternative to `attributes()`, but based heavily on the C level -// implementation, `do_attributes()`. -// -// Unlike `attributes()`, this does not inspect the row names in any way (i.e. -// typically to expand compact row names), which has the side effect of -// materializing duckplyr queries. -// https://github.com/tidyverse/dplyr/issues/6525#issuecomment-1303619152 -// https://github.com/wch/r-source/blob/69b94f0c8ce9b2497f6d7a81922575f6c585b713/src/main/attrib.c#L176-L177 -SEXP ffi_dplyr_attributes(SEXP x) { - SEXP attributes = ATTRIB(x); - R_xlen_t size = Rf_xlength(attributes); - - SEXP out = PROTECT(Rf_allocVector(VECSXP, size)); - - SEXP names = Rf_allocVector(STRSXP, size); - Rf_setAttrib(out, R_NamesSymbol, names); - - for (R_xlen_t i = 0; i < size; ++i, attributes = CDR(attributes)) { - SEXP name = TAG(attributes); - - if (TYPEOF(name) != SYMSXP) { - // Hopefully never happens, but `do_attributes()` has a special path - // for this, so we are extra careful not to crash - Rf_errorcall(R_NilValue, "Unexpected non-symbol attribute pairlist tag."); - } - - SET_STRING_ELT(names, i, PRINTNAME(name)); - - if (name == R_RowNamesSymbol) { - // Super special path for row names! Row names are passed through as is, - // with no modification, although we do mark the attribute as not mutable, - // like `Rf_getAttrib()`. - SEXP value = CAR(attributes); - MARK_NOT_MUTABLE(value); - SET_VECTOR_ELT(out, i, value); - continue; - } - - SEXP value = Rf_getAttrib(x, name); - SET_VECTOR_ELT(out, i, value); - } - - UNPROTECT(1); - return out; -} - -// Alternative to `attributes<-()`, but based heavily on the C level -// implementation, `do_attributesgets()`. -// -// Unlike `attributes<-()`, this does not inspect the row names in any way (i.e. -// typically to create compact row names), which has the side effect of -// materializing duckplyr queries. -// -// If we want this to be a full replacement, then it should also ensure that the -// `dim` attribute is set before the `dimnames` attribute, but we currently -// never expect those attributes to appear on data frames. -// https://github.com/wch/r-source/blob/69b94f0c8ce9b2497f6d7a81922575f6c585b713/src/main/attrib.c#L1401C39-L1404 -SEXP ffi_dplyr_set_attributes(SEXP x, SEXP attributes) { - if (x == R_NilValue) { - Rf_errorcall(R_NilValue, "Internal error: `x` can't be `NULL`."); - } - - // Make an ALTREP wrapper if possible, since the underlying data doesn't change -#if R_VERSION >= R_Version(3, 6, 0) - x = PROTECT(R_shallow_duplicate_attr(x)); -#else - x = PROTECT(Rf_shallow_duplicate(x)); -#endif - - // Remove existing attributes, and unset the object bit. - // It will be reset by `Rf_setAttrib()` if needed. - SET_ATTRIB(x, R_NilValue); - SET_OBJECT(x, 0); - - if (TYPEOF(attributes) != VECSXP) { - Rf_errorcall(R_NilValue, "Internal error: `attributes` must be a list."); - } - - const SEXP* v_attributes = VECTOR_PTR_RO(attributes); - - R_xlen_t size = Rf_xlength(attributes); - SEXP names = Rf_getAttrib(attributes, R_NamesSymbol); - - if (TYPEOF(names) != STRSXP) { - Rf_errorcall(R_NilValue, "Internal error: `attributes` must be named."); - } - - const SEXP* v_names = STRING_PTR_RO(names); - - // Check that each element of `names` is named - for (R_xlen_t i = 0; i < size; ++i) { - SEXP name = v_names[i]; - - if (name == NA_STRING || name == R_BlankString) { - Rf_errorcall(R_NilValue, "All attributes must have names. Attribute %i does not.", i + 1); - } - } - - for (R_xlen_t i = 0; i < size; ++i) { - SEXP name = Rf_installChar(v_names[i]); - SEXP attribute = v_attributes[i]; - - if (name == R_RowNamesSymbol) { - // Super special path for row names! - // Row names are pushed on untouched and unchecked. - r_attrib_push(x, name, attribute); - continue; - } - - Rf_setAttrib(x, name, attribute); - } - - UNPROTECT(1); - return x; -} diff --git a/src/dplyr.h b/src/dplyr.h index bfc075ba66..fc0359095d 100644 --- a/src/dplyr.h +++ b/src/dplyr.h @@ -96,8 +96,9 @@ inline bool obj_is_list(SEXP x) { } -SEXP ffi_dplyr_attributes(SEXP x); -SEXP ffi_dplyr_set_attributes(SEXP x, SEXP attributes); +SEXP ffi_dplyr_reconstruct(SEXP data, SEXP template_); +SEXP ffi_test_dplyr_attributes(SEXP x); +SEXP ffi_test_dplyr_set_attributes(SEXP x, SEXP attributes); SEXP dplyr_expand_groups(SEXP old_groups, SEXP positions, SEXP s_nr); SEXP dplyr_cumall(SEXP x); diff --git a/src/init.cpp b/src/init.cpp index f7971c6e2e..11f797fdce 100644 --- a/src/init.cpp +++ b/src/init.cpp @@ -95,8 +95,9 @@ SEXP dplyr_init_library(SEXP ns_dplyr, SEXP ns_vctrs, SEXP ns_rlang) { static const R_CallMethodDef CallEntries[] = { {"dplyr_init_library", (DL_FUNC)& dplyr_init_library, 3}, - {"ffi_dplyr_attributes", (DL_FUNC)& ffi_dplyr_attributes, 1}, - {"ffi_dplyr_set_attributes", (DL_FUNC)& ffi_dplyr_set_attributes, 2}, + {"ffi_dplyr_reconstruct", (DL_FUNC)& ffi_dplyr_reconstruct, 2}, + {"ffi_test_dplyr_attributes", (DL_FUNC)& ffi_test_dplyr_attributes, 1}, + {"ffi_test_dplyr_set_attributes", (DL_FUNC)& ffi_test_dplyr_set_attributes, 2}, {"dplyr_expand_groups", (DL_FUNC)& dplyr_expand_groups, 3}, {"dplyr_cumall", (DL_FUNC)& dplyr_cumall, 1}, diff --git a/src/reconstruct.cpp b/src/reconstruct.cpp new file mode 100644 index 0000000000..e3e56bbbdf --- /dev/null +++ b/src/reconstruct.cpp @@ -0,0 +1,136 @@ +#include "dplyr.h" + +// Essentially, a C implementation of: +// +// ``` +// attributes <- attributes(template) +// attributes$names <- names(data) +// attributes$row.names <- .row_names_info(data, type = 0L) +// attributes(data) <- attributes +// ``` +// +// The problem with that is that: +// - `attributes()` ends up calling `Rf_getAttrib()`, which tries to check +// for internal `row.names` in `template` so they aren't leaked to the user. +// Unfortunately this materializes lazy ALTREP `row.names`, like those used +// by duckplyr. +// - `attributes<-()` ends up calling `Rf_setAttrib()`, which tries to check +// if it can make efficient internal `row.names`. Again, this materializes +// lazy ALTREP `row.names`, like those used by duckplyr. +// +// So we bypass that here by carefully manipulating the attribute pairlists. +// +// We expect that at this point, both `data` and `template_` are S3 data +// frames, both of which have `names` and `row.names` attributes. If this isn't +// true, we error. +// - For `data`, we enforce this in `dplyr_reconstruct()`'s generic by calling +// `dplyr_new_data_frame()` (ideally no intermediate method invalidates this). +// - For `template_`, we assume this since we got here through the S3 method +// `dplyr_reconstruct.data.frame()`, which dispatched off `template_`. A +// well-formed S3 data frame must have `names` and `row.names` attributes. +// +// https://github.com/tidyverse/dplyr/pull/6947 +// https://github.com/tidyverse/dplyr/issues/6525#issuecomment-1303619152 +// https://github.com/wch/r-source/blob/69b94f0c8ce9b2497f6d7a81922575f6c585b713/src/main/attrib.c#L176-L177 +// https://github.com/wch/r-source/blob/69b94f0c8ce9b2497f6d7a81922575f6c585b713/src/main/attrib.c#L57 +SEXP ffi_dplyr_reconstruct(SEXP data, SEXP template_) { + if (TYPEOF(data) != VECSXP) { + Rf_errorcall(R_NilValue, "Internal error: `data` must be a list."); + } + if (TYPEOF(template_) != VECSXP) { + Rf_errorcall(R_NilValue, "Internal error: `template` must be a list."); + } + if (!OBJECT(data)) { + Rf_errorcall(R_NilValue, "Internal error: `data` must be an object."); + } + if (!OBJECT(template_)) { + Rf_errorcall(R_NilValue, "Internal error: `template` must be an object."); + } + + bool seen_names = false; + bool seen_row_names = false; + + // Pull the `names` and `row.names` off `data`. + // These are the only 2 attributes from `data` that persist. + SEXP names = R_NilValue; + SEXP row_names = R_NilValue; + + for (SEXP node = ATTRIB(data); node != R_NilValue; node = CDR(node)) { + SEXP tag = TAG(node); + + if (tag == R_NamesSymbol) { + names = CAR(node); + MARK_NOT_MUTABLE(names); + seen_names = true; + } + if (tag == R_RowNamesSymbol) { + row_names = CAR(node); + MARK_NOT_MUTABLE(row_names); + seen_row_names = true; + } + } + + if (!seen_names) { + Rf_errorcall(R_NilValue, "Internal error: `data` must have a `names` attribute."); + } + if (!seen_row_names) { + Rf_errorcall(R_NilValue, "Internal error: `data` must have a `row.names` attribute."); + } + + seen_names = false; + seen_row_names = false; + + // Now replace the `names` and `row.names` attributes in the `template_` + // attributes with the ones from `data`. This attribute set becomes the final + // one we set on `data`. + SEXP attributes = ATTRIB(template_); + attributes = PROTECT(Rf_shallow_duplicate(attributes)); + + for (SEXP node = attributes; node != R_NilValue; node = CDR(node)) { + SEXP tag = TAG(node); + + if (tag == R_NamesSymbol) { + SETCAR(node, names); + seen_names = true; + } + if (tag == R_RowNamesSymbol) { + SETCAR(node, row_names); + seen_row_names = true; + } + } + + if (!seen_names) { + Rf_errorcall(R_NilValue, "Internal error: `template` must have a `names` attribute."); + } + if (!seen_row_names) { + Rf_errorcall(R_NilValue, "Internal error: `template` must have a `row.names` attribute."); + } + + // Make an ALTREP wrapper if possible, since the underlying data doesn't change +#if R_VERSION >= R_Version(3, 6, 0) + data = PROTECT(R_shallow_duplicate_attr(data)); +#else + data = PROTECT(Rf_shallow_duplicate(data)); +#endif + + SET_ATTRIB(data, attributes); + + UNPROTECT(2); + return data; +} + +// Very unsafe wrappers needed for testing. +// Bypass `Rf_getAttrib()` and `Rf_setAttrib()` calls to avoid forcing ALTREP +// `row.names`. +SEXP ffi_test_dplyr_attributes(SEXP x) { + return ATTRIB(x); +} +SEXP ffi_test_dplyr_set_attributes(SEXP x, SEXP attributes) { + if (TYPEOF(attributes) != LISTSXP) { + Rf_errorcall(R_NilValue, "`attributes` must be a pairlist."); + } + x = PROTECT(Rf_shallow_duplicate(x)); + SET_ATTRIB(x, attributes); + UNPROTECT(1); + return x; +} diff --git a/tests/testthat/_snaps/utils.md b/tests/testthat/_snaps/utils.md deleted file mode 100644 index 19ac9a853e..0000000000 --- a/tests/testthat/_snaps/utils.md +++ /dev/null @@ -1,32 +0,0 @@ -# can't set attributes with `NULL` - - Code - dplyr_set_attributes(1, NULL) - Condition - Error: - ! Internal error: `attributes` must be a list. - -# can't set attributes on `NULL` - - Code - dplyr_set_attributes(NULL, list(x = 1)) - Condition - Error: - ! Internal error: `x` can't be `NULL`. - -# can't set attributes with unnamed elements - - Code - dplyr_set_attributes(1, list()) - Condition - Error: - ! Internal error: `attributes` must be named. - ---- - - Code - dplyr_set_attributes(1, list(x = 1, 2)) - Condition - Error: - ! All attributes must have names. Attribute 2 does not. - diff --git a/tests/testthat/test-generics.R b/tests/testthat/test-generics.R index a3c43a5b8a..2453420d44 100644 --- a/tests/testthat/test-generics.R +++ b/tests/testthat/test-generics.R @@ -137,3 +137,54 @@ test_that("dplyr_reconstruct() strips attributes before dispatch", { dplyr_reconstruct(df, df) expect_identical(out, data.frame(x = 1, row.names = "a")) }) + +test_that("`dplyr_reconstruct()` retains attribute ordering of `template`", { + df <- vctrs::data_frame(x = 1) + expect_identical( + attributes(dplyr_reconstruct(df, df)), + attributes(df) + ) +}) + +test_that("`dplyr_reconstruct()` doesn't modify the original `data` in place", { + data <- new_data_frame(list(x = 1), foo = "bar") + template <- vctrs::data_frame(x = 1) + + out <- dplyr_reconstruct(data, template) + + expect_null(attr(out, "foo")) + expect_identical(attr(data, "foo"), "bar") +}) + +test_that("`dplyr_reconstruct()`, which gets and sets attributes, doesn't touch `row.names` (#6525)", { + skip_if_no_lazy_character() + + dplyr_attributes <- function(x) { + .Call(ffi_test_dplyr_attributes, x) + } + dplyr_set_attributes <- function(x, attributes) { + .Call(ffi_test_dplyr_set_attributes, x, attributes) + } + + df <- vctrs::data_frame(x = 1) + + attributes <- attributes(df) + attributes$row.names <- new_lazy_character(function() "a") + attributes <- as.pairlist(attributes) + + df_with_lazy_row_names <- dplyr_set_attributes(df, attributes) + + # Ensure `data` row names aren't materialized + x <- dplyr_reconstruct(df_with_lazy_row_names, df) + attributes <- dplyr_attributes(df_with_lazy_row_names) + expect_false(lazy_character_is_materialized(attributes$row.names)) + + # `data` row names should also propagate into the result unmaterialized + attributes <- dplyr_attributes(x) + expect_false(lazy_character_is_materialized(attributes$row.names)) + + # Ensure `template` row names aren't materialized + x <- dplyr_reconstruct(df, df_with_lazy_row_names) + attributes <- dplyr_attributes(df_with_lazy_row_names) + expect_false(lazy_character_is_materialized(attributes$row.names)) +}) diff --git a/tests/testthat/test-utils.R b/tests/testthat/test-utils.R index b78c44b030..c5ee59045e 100644 --- a/tests/testthat/test-utils.R +++ b/tests/testthat/test-utils.R @@ -70,94 +70,3 @@ test_that("`list_flatten()` accepts a predicate `fn` to selectively flatten", { list(1, list(b = 2), 3, c = 4, d = list(e = 5), 6) ) }) - -# ------------------------------------------------------------------------------ -# dplyr_attributes() / dplyr_set_attributes() - -test_that("can get attributes", { - x <- list() - # Empty named list! - expect_identical(dplyr_attributes(x), set_names(list())) - - x <- structure(list(), foo = 1, bar = 2) - expect_identical(dplyr_attributes(x), list(foo = 1, bar = 2)) -}) - -test_that("can set attributes", { - x <- dplyr_set_attributes(1, list(foo = 1, bar = 2)) - expect_identical(dplyr_attributes(x), list(foo = 1, bar = 2)) -}) - -test_that("can't set attributes with `NULL`", { - expect_snapshot(error = TRUE, { - dplyr_set_attributes(1, NULL) - }) -}) - -test_that("can't set attributes on `NULL`", { - expect_snapshot(error = TRUE, { - dplyr_set_attributes(NULL, list(x = 1)) - }) -}) - -test_that("can't set attributes with unnamed elements", { - expect_snapshot(error = TRUE, { - dplyr_set_attributes(1, list()) - }) - expect_snapshot(error = TRUE, { - dplyr_set_attributes(1, list(x = 1, 2)) - }) -}) - -test_that("object bit is set when setting attributes", { - x <- list() - attributes <- list(class = "foo") - - expect_false(is.object(x)) - expect_true(is.object(dplyr_set_attributes(x, attributes))) -}) - -test_that("`row.names` are not touched whatsoever when getting or setting (#6525)", { - skip_if_no_lazy_character() - - df <- vctrs::data_frame(x = 1) - - attributes <- attributes(df) - attributes$row.names <- new_lazy_character(function() "a") - - # Start out unmaterialized - expect_false(lazy_character_is_materialized(attributes$row.names)) - - # Full roundtrip - df <- dplyr_set_attributes(df, attributes) - attributes <- dplyr_attributes(df) - - # Still unmaterialized - expect_false(lazy_character_is_materialized(attributes$row.names)) -}) - -test_that("`dplyr_reconstruct()`, which gets and sets attributes, doesn't touch `row.names` (#6525)", { - skip_if_no_lazy_character() - - df <- vctrs::data_frame(x = 1) - - attributes <- attributes(df) - attributes$row.names <- new_lazy_character(function() "a") - - df_with_lazy_row_names <- dplyr_set_attributes(df, attributes) - - # Ensure `data` row names aren't materialized - x <- dplyr_reconstruct(df_with_lazy_row_names, df) - attributes <- dplyr_attributes(df_with_lazy_row_names) - expect_false(lazy_character_is_materialized(attributes$row.names)) - - # `data` row names should also propagate into the result unmaterialized - attributes <- dplyr_attributes(x) - expect_false(lazy_character_is_materialized(attributes$row.names)) - - # Ensure `template` row names aren't materialized - x <- dplyr_reconstruct(df, df_with_lazy_row_names) - attributes <- dplyr_attributes(df_with_lazy_row_names) - expect_false(lazy_character_is_materialized(attributes$row.names)) -}) -