From bd97acfa2159b76668e7275f7c2794ccdc8c7206 Mon Sep 17 00:00:00 2001 From: Davis Vaughan Date: Mon, 30 Oct 2023 15:06:34 -0400 Subject: [PATCH] Implement `dplyr_attributes()` and `dplyr_set_attributes()` --- R/generics.R | 10 ++- R/utils.R | 7 ++ src/attributes.cpp | 127 +++++++++++++++++++++++++++++++++ src/dplyr.h | 3 + src/init.cpp | 3 + tests/testthat/_snaps/utils.md | 32 +++++++++ tests/testthat/helper-lazy.R | 22 ++++++ tests/testthat/test-utils.R | 91 +++++++++++++++++++++++ 8 files changed, 289 insertions(+), 6 deletions(-) create mode 100644 src/attributes.cpp create mode 100644 tests/testthat/_snaps/utils.md create mode 100644 tests/testthat/helper-lazy.R diff --git a/R/generics.R b/R/generics.R index e31073c72a..9e1d171161 100644 --- a/R/generics.R +++ b/R/generics.R @@ -202,12 +202,10 @@ dplyr_reconstruct_dispatch <- function(data, template) { #' @export dplyr_reconstruct.data.frame <- function(data, template) { - attrs <- attributes(template) - attrs$names <- names(data) - attrs$row.names <- .row_names_info(data, type = 0L) - - attributes(data) <- attrs - data + attributes <- dplyr_attributes(template) + attributes$names <- names(data) + attributes$row.names <- .row_names_info(data, type = 0L) + dplyr_set_attributes(data, attributes) } #' @export diff --git a/R/utils.R b/R/utils.R index e74ac8f32b..29cb024f9f 100644 --- a/R/utils.R +++ b/R/utils.R @@ -48,6 +48,13 @@ dplyr_new_tibble <- function(x, size) { new_data_frame(x = x, n = size, class = c("tbl_df", "tbl")) } +dplyr_attributes <- function(x) { + .Call(ffi_dplyr_attributes, x) +} +dplyr_set_attributes <- function(x, attributes) { + .Call(ffi_dplyr_set_attributes, x, attributes) +} + #' @param x A list #' @param fn An optional function of 1 argument to be applied to each list #' element of `x`. This allows you to further refine what elements should be diff --git a/src/attributes.cpp b/src/attributes.cpp new file mode 100644 index 0000000000..0cb3f3b776 --- /dev/null +++ b/src/attributes.cpp @@ -0,0 +1,127 @@ +#include "dplyr.h" + +// Push a new attribute at the end of `x`'s attributes. +// Caller is responsible for ensuring the `tag` doesn't already exist. +static inline +SEXP r_attrib_push(SEXP x, SEXP tag, SEXP value) { + SEXP attributes = Rf_cons(value, ATTRIB(x)); + SET_TAG(attributes, tag); + SET_ATTRIB(x, attributes); + return attributes; +} + +// Alternative to `attributes()`, but based heavily on the C level +// implementation, `do_attributes()`. +// +// Unlike `attributes()`, this does not inspect the row names in any way (i.e. +// typically to expand compact row names), which has the side effect of +// materializing duckplyr queries. +// https://github.com/tidyverse/dplyr/issues/6525#issuecomment-1303619152 +// https://github.com/wch/r-source/blob/69b94f0c8ce9b2497f6d7a81922575f6c585b713/src/main/attrib.c#L176-L177 +SEXP ffi_dplyr_attributes(SEXP x) { + SEXP attributes = ATTRIB(x); + R_xlen_t size = Rf_xlength(attributes); + + SEXP out = PROTECT(Rf_allocVector(VECSXP, size)); + + SEXP names = Rf_allocVector(STRSXP, size); + Rf_setAttrib(out, R_NamesSymbol, names); + + for (R_xlen_t i = 0; i < size; ++i, attributes = CDR(attributes)) { + SEXP name = TAG(attributes); + + if (TYPEOF(name) != SYMSXP) { + // Hopefully never happens, but `do_attributes()` has a special path + // for this, so we are extra careful not to crash + Rf_errorcall(R_NilValue, "Unexpected non-symbol attribute pairlist tag."); + } + + SET_STRING_ELT(names, i, PRINTNAME(name)); + + if (name == R_RowNamesSymbol) { + // Super special path for row names! Row names are passed through as is, + // with no modification, although we do mark the attribute as not mutable, + // like `Rf_getAttrib()`. + SEXP value = CAR(attributes); + MARK_NOT_MUTABLE(value); + SET_VECTOR_ELT(out, i, value); + continue; + } + + SEXP value = Rf_getAttrib(x, name); + SET_VECTOR_ELT(out, i, value); + } + + UNPROTECT(1); + return out; +} + +// Alternative to `attributes<-()`, but based heavily on the C level +// implementation, `do_attributesgets()`. +// +// Unlike `attributes<-()`, this does not inspect the row names in any way (i.e. +// typically to create compact row names), which has the side effect of +// materializing duckplyr queries. +// +// If we want this to be a full replacement, then it should also ensure that the +// `dim` attribute is set before the `dimnames` attribute, but we currently +// never expect those attributes to appear on data frames. +// https://github.com/wch/r-source/blob/69b94f0c8ce9b2497f6d7a81922575f6c585b713/src/main/attrib.c#L1401C39-L1404 +SEXP ffi_dplyr_set_attributes(SEXP x, SEXP attributes) { + if (x == R_NilValue) { + Rf_errorcall(R_NilValue, "Internal error: `x` can't be `NULL`."); + } + + // Make an ALTREP wrapper if possible, since the underlying data doesn't change +#if R_VERSION >= R_Version(3, 6, 0) + x = PROTECT(R_shallow_duplicate_attr(x)); +#else + x = PROTECT(Rf_shallow_duplicate(x)); +#endif + + // Remove existing attributes, and unset the object bit. + // It will be reset by `Rf_setAttrib()` if needed. + SET_ATTRIB(x, R_NilValue); + SET_OBJECT(x, 0); + + if (TYPEOF(attributes) != VECSXP) { + Rf_errorcall(R_NilValue, "Internal error: `attributes` must be a list."); + } + + const SEXP* v_attributes = VECTOR_PTR_RO(attributes); + + R_xlen_t size = Rf_xlength(attributes); + SEXP names = Rf_getAttrib(attributes, R_NamesSymbol); + + if (TYPEOF(names) != STRSXP) { + Rf_errorcall(R_NilValue, "Internal error: `attributes` must be named."); + } + + const SEXP* v_names = STRING_PTR_RO(names); + + // Check that each element of `names` is named + for (R_xlen_t i = 0; i < size; ++i) { + SEXP name = v_names[i]; + + if (name == NA_STRING || name == R_BlankString) { + Rf_errorcall(R_NilValue, "All attributes must have names. Attribute %i does not.", i + 1); + } + } + + for (R_xlen_t i = 0; i < size; ++i) { + SEXP name = Rf_installChar(v_names[i]); + SEXP attribute = v_attributes[i]; + + if (name == R_RowNamesSymbol) { + // Super special path for row names! + // Row names are pushed on untouched and unchecked. + r_attrib_push(x, name, attribute); + continue; + } + + Rf_setAttrib(x, name, attribute); + } + + UNPROTECT(1); + return x; +} diff --git a/src/dplyr.h b/src/dplyr.h index 8c0035c29c..bfc075ba66 100644 --- a/src/dplyr.h +++ b/src/dplyr.h @@ -96,6 +96,9 @@ inline bool obj_is_list(SEXP x) { } +SEXP ffi_dplyr_attributes(SEXP x); +SEXP ffi_dplyr_set_attributes(SEXP x, SEXP attributes); + SEXP dplyr_expand_groups(SEXP old_groups, SEXP positions, SEXP s_nr); SEXP dplyr_cumall(SEXP x); SEXP dplyr_cumany(SEXP x); diff --git a/src/init.cpp b/src/init.cpp index 09e699f5f5..f7971c6e2e 100644 --- a/src/init.cpp +++ b/src/init.cpp @@ -95,6 +95,9 @@ SEXP dplyr_init_library(SEXP ns_dplyr, SEXP ns_vctrs, SEXP ns_rlang) { static const R_CallMethodDef CallEntries[] = { {"dplyr_init_library", (DL_FUNC)& dplyr_init_library, 3}, + {"ffi_dplyr_attributes", (DL_FUNC)& ffi_dplyr_attributes, 1}, + {"ffi_dplyr_set_attributes", (DL_FUNC)& ffi_dplyr_set_attributes, 2}, + {"dplyr_expand_groups", (DL_FUNC)& dplyr_expand_groups, 3}, {"dplyr_cumall", (DL_FUNC)& dplyr_cumall, 1}, {"dplyr_cumany", (DL_FUNC)& dplyr_cumany, 1}, diff --git a/tests/testthat/_snaps/utils.md b/tests/testthat/_snaps/utils.md new file mode 100644 index 0000000000..19ac9a853e --- /dev/null +++ b/tests/testthat/_snaps/utils.md @@ -0,0 +1,32 @@ +# can't set attributes with `NULL` + + Code + dplyr_set_attributes(1, NULL) + Condition + Error: + ! Internal error: `attributes` must be a list. + +# can't set attributes on `NULL` + + Code + dplyr_set_attributes(NULL, list(x = 1)) + Condition + Error: + ! Internal error: `x` can't be `NULL`. + +# can't set attributes with unnamed elements + + Code + dplyr_set_attributes(1, list()) + Condition + Error: + ! Internal error: `attributes` must be named. + +--- + + Code + dplyr_set_attributes(1, list(x = 1, 2)) + Condition + Error: + ! All attributes must have names. Attribute 2 does not. + diff --git a/tests/testthat/helper-lazy.R b/tests/testthat/helper-lazy.R new file mode 100644 index 0000000000..ea086992d6 --- /dev/null +++ b/tests/testthat/helper-lazy.R @@ -0,0 +1,22 @@ +skip_if_no_lazy_character <- function() { + skip_if(getRversion() <= "3.5.0") + + new_lazy_character <- import_vctrs("new_lazy_character", optional = TRUE) + lazy_character_is_materialized <- import_vctrs("lazy_character_is_materialized", optional = TRUE) + + if (is.null(new_lazy_character) || is.null(lazy_character_is_materialized)) { + skip("Lazy character helpers from vctrs are not available.") + } + + invisible() +} + +new_lazy_character <- function(fn) { + f <- import_vctrs("new_lazy_character") + f(fn) +} + +lazy_character_is_materialized <- function(x) { + f <- import_vctrs("lazy_character_is_materialized") + f(x) +} diff --git a/tests/testthat/test-utils.R b/tests/testthat/test-utils.R index c5ee59045e..b78c44b030 100644 --- a/tests/testthat/test-utils.R +++ b/tests/testthat/test-utils.R @@ -70,3 +70,94 @@ test_that("`list_flatten()` accepts a predicate `fn` to selectively flatten", { list(1, list(b = 2), 3, c = 4, d = list(e = 5), 6) ) }) + +# ------------------------------------------------------------------------------ +# dplyr_attributes() / dplyr_set_attributes() + +test_that("can get attributes", { + x <- list() + # Empty named list! + expect_identical(dplyr_attributes(x), set_names(list())) + + x <- structure(list(), foo = 1, bar = 2) + expect_identical(dplyr_attributes(x), list(foo = 1, bar = 2)) +}) + +test_that("can set attributes", { + x <- dplyr_set_attributes(1, list(foo = 1, bar = 2)) + expect_identical(dplyr_attributes(x), list(foo = 1, bar = 2)) +}) + +test_that("can't set attributes with `NULL`", { + expect_snapshot(error = TRUE, { + dplyr_set_attributes(1, NULL) + }) +}) + +test_that("can't set attributes on `NULL`", { + expect_snapshot(error = TRUE, { + dplyr_set_attributes(NULL, list(x = 1)) + }) +}) + +test_that("can't set attributes with unnamed elements", { + expect_snapshot(error = TRUE, { + dplyr_set_attributes(1, list()) + }) + expect_snapshot(error = TRUE, { + dplyr_set_attributes(1, list(x = 1, 2)) + }) +}) + +test_that("object bit is set when setting attributes", { + x <- list() + attributes <- list(class = "foo") + + expect_false(is.object(x)) + expect_true(is.object(dplyr_set_attributes(x, attributes))) +}) + +test_that("`row.names` are not touched whatsoever when getting or setting (#6525)", { + skip_if_no_lazy_character() + + df <- vctrs::data_frame(x = 1) + + attributes <- attributes(df) + attributes$row.names <- new_lazy_character(function() "a") + + # Start out unmaterialized + expect_false(lazy_character_is_materialized(attributes$row.names)) + + # Full roundtrip + df <- dplyr_set_attributes(df, attributes) + attributes <- dplyr_attributes(df) + + # Still unmaterialized + expect_false(lazy_character_is_materialized(attributes$row.names)) +}) + +test_that("`dplyr_reconstruct()`, which gets and sets attributes, doesn't touch `row.names` (#6525)", { + skip_if_no_lazy_character() + + df <- vctrs::data_frame(x = 1) + + attributes <- attributes(df) + attributes$row.names <- new_lazy_character(function() "a") + + df_with_lazy_row_names <- dplyr_set_attributes(df, attributes) + + # Ensure `data` row names aren't materialized + x <- dplyr_reconstruct(df_with_lazy_row_names, df) + attributes <- dplyr_attributes(df_with_lazy_row_names) + expect_false(lazy_character_is_materialized(attributes$row.names)) + + # `data` row names should also propagate into the result unmaterialized + attributes <- dplyr_attributes(x) + expect_false(lazy_character_is_materialized(attributes$row.names)) + + # Ensure `template` row names aren't materialized + x <- dplyr_reconstruct(df, df_with_lazy_row_names) + attributes <- dplyr_attributes(df_with_lazy_row_names) + expect_false(lazy_character_is_materialized(attributes$row.names)) +}) +