Skip to content

Commit

Permalink
Implement dplyr_attributes() and dplyr_set_attributes()
Browse files Browse the repository at this point in the history
  • Loading branch information
DavisVaughan committed Oct 30, 2023
1 parent 049359c commit 48cc6d8
Show file tree
Hide file tree
Showing 8 changed files with 289 additions and 6 deletions.
10 changes: 4 additions & 6 deletions R/generics.R
Original file line number Diff line number Diff line change
Expand Up @@ -202,12 +202,10 @@ dplyr_reconstruct_dispatch <- function(data, template) {

#' @export
dplyr_reconstruct.data.frame <- function(data, template) {
attrs <- attributes(template)
attrs$names <- names(data)
attrs$row.names <- .row_names_info(data, type = 0L)

attributes(data) <- attrs
data
attributes <- dplyr_attributes(template)
attributes$names <- names(data)
attributes$row.names <- .row_names_info(data, type = 0L)
dplyr_set_attributes(data, attributes)
}

#' @export
Expand Down
7 changes: 7 additions & 0 deletions R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,13 @@ dplyr_new_tibble <- function(x, size) {
new_data_frame(x = x, n = size, class = c("tbl_df", "tbl"))
}

dplyr_attributes <- function(x) {
.Call(ffi_dplyr_attributes, x)
}
dplyr_set_attributes <- function(x, attributes) {
.Call(ffi_dplyr_set_attributes, x, attributes)
}

#' @param x A list
#' @param fn An optional function of 1 argument to be applied to each list
#' element of `x`. This allows you to further refine what elements should be
Expand Down
127 changes: 127 additions & 0 deletions src/attributes.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
#include "dplyr.h"

// Push a new attribute at the end of `x`'s attributes.
// Caller is responsible for ensuring the `tag` doesn't already exist.
static inline
SEXP r_attrib_push(SEXP x, SEXP tag, SEXP value) {
SEXP attributes = Rf_cons(value, ATTRIB(x));
SET_TAG(attributes, tag);
SET_ATTRIB(x, attributes);
return attributes;
}

// Alternative to `attributes()`, but based heavily on the C level
// implementation, `do_attributes()`.
//
// Unlike `attributes()`, this does not inspect the row names in any way (i.e.
// typically to expand compact row names), which has the side effect of
// materializing duckplyr queries.
// https://github.com/tidyverse/dplyr/issues/6525#issuecomment-1303619152
// https://github.com/wch/r-source/blob/69b94f0c8ce9b2497f6d7a81922575f6c585b713/src/main/attrib.c#L176-L177
SEXP ffi_dplyr_attributes(SEXP x) {
SEXP attributes = ATTRIB(x);
R_xlen_t size = Rf_xlength(attributes);

SEXP out = PROTECT(Rf_allocVector(VECSXP, size));

SEXP names = Rf_allocVector(STRSXP, size);
Rf_setAttrib(out, R_NamesSymbol, names);

for (R_xlen_t i = 0; i < size; ++i, attributes = CDR(attributes)) {
SEXP name = TAG(attributes);

if (TYPEOF(name) != SYMSXP) {
// Hopefully never happens, but `do_attributes()` has a special path
// for this, so we are extra careful not to crash
Rf_errorcall(R_NilValue, "Unexpected non-symbol attribute pairlist tag.");
}

SET_STRING_ELT(names, i, PRINTNAME(name));

if (name == R_RowNamesSymbol) {
// Super special path for row names! Row names are passed through as is,
// with no modification, although we do mark the attribute as not mutable,
// like `Rf_getAttrib()`.
SEXP value = CAR(attributes);
MARK_NOT_MUTABLE(value);
SET_VECTOR_ELT(out, i, value);
continue;
}

SEXP value = Rf_getAttrib(x, name);
SET_VECTOR_ELT(out, i, value);
}

UNPROTECT(1);
return out;
}

// Alternative to `attributes<-()`, but based heavily on the C level
// implementation, `do_attributesgets()`.
//
// Unlike `attributes<-()`, this does not inspect the row names in any way (i.e.
// typically to create compact row names), which has the side effect of
// materializing duckplyr queries.
//
// If we want this to be a full replacement, then it should also ensure that the
// `dim` attribute is set before the `dimnames` attribute, but we currently
// never expect those attributes to appear on data frames.
// https://github.com/wch/r-source/blob/69b94f0c8ce9b2497f6d7a81922575f6c585b713/src/main/attrib.c#L1401C39-L1404
SEXP ffi_dplyr_set_attributes(SEXP x, SEXP attributes) {
if (x == R_NilValue) {
Rf_errorcall(R_NilValue, "Internal error: `x` can't be `NULL`.");
}

// Make an ALTREP wrapper if possible, since the underlying data doesn't change
#if R_VERSION >= R_Version(3, 6, 0)
x = PROTECT(R_shallow_duplicate_attr(x));
#else
x = PROTECT(Rf_shallow_duplicate(x));
#endif

// Remove existing attributes, and unset the object bit.
// It will be reset by `Rf_setAttrib()` if needed.
SET_ATTRIB(x, R_NilValue);
SET_OBJECT(x, 0);

if (TYPEOF(attributes) != VECSXP) {
Rf_errorcall(R_NilValue, "Internal error: `attributes` must be a list.");
}

const SEXP* v_attributes = VECTOR_PTR_RO(attributes);

R_xlen_t size = Rf_xlength(attributes);
SEXP names = Rf_getAttrib(attributes, R_NamesSymbol);

if (TYPEOF(names) != STRSXP) {
Rf_errorcall(R_NilValue, "Internal error: `attributes` must be named.");
}

const SEXP* v_names = STRING_PTR_RO(names);

// Check that each element of `names` is named
for (R_xlen_t i = 0; i < size; ++i) {
SEXP name = v_names[i];

if (name == NA_STRING || name == R_BlankString) {
Rf_errorcall(R_NilValue, "All attributes must have names. Attribute %i does not.", i + 1);
}
}

for (R_xlen_t i = 0; i < size; ++i) {
SEXP name = Rf_installChar(v_names[i]);
SEXP attribute = v_attributes[i];

if (name == R_RowNamesSymbol) {
// Super special path for row names!
// Row names are pushed on untouched and unchecked.
r_attrib_push(x, name, attribute);
continue;
}

Rf_setAttrib(x, name, attribute);
}

UNPROTECT(1);
return x;
}
3 changes: 3 additions & 0 deletions src/dplyr.h
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,9 @@ inline bool obj_is_list(SEXP x) {

}

SEXP ffi_dplyr_attributes(SEXP x);
SEXP ffi_dplyr_set_attributes(SEXP x, SEXP attributes);

SEXP dplyr_expand_groups(SEXP old_groups, SEXP positions, SEXP s_nr);
SEXP dplyr_cumall(SEXP x);
SEXP dplyr_cumany(SEXP x);
Expand Down
3 changes: 3 additions & 0 deletions src/init.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,9 @@ SEXP dplyr_init_library(SEXP ns_dplyr, SEXP ns_vctrs, SEXP ns_rlang) {
static const R_CallMethodDef CallEntries[] = {
{"dplyr_init_library", (DL_FUNC)& dplyr_init_library, 3},

{"ffi_dplyr_attributes", (DL_FUNC)& ffi_dplyr_attributes, 1},
{"ffi_dplyr_set_attributes", (DL_FUNC)& ffi_dplyr_set_attributes, 2},

{"dplyr_expand_groups", (DL_FUNC)& dplyr_expand_groups, 3},
{"dplyr_cumall", (DL_FUNC)& dplyr_cumall, 1},
{"dplyr_cumany", (DL_FUNC)& dplyr_cumany, 1},
Expand Down
32 changes: 32 additions & 0 deletions tests/testthat/_snaps/utils.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# can't set attributes with `NULL`

Code
dplyr_set_attributes(1, NULL)
Condition
Error:
! Internal error: `attributes` must be a list.

# can't set attributes on `NULL`

Code
dplyr_set_attributes(NULL, list(x = 1))
Condition
Error:
! Internal error: `x` can't be `NULL`.

# can't set attributes with unnamed elements

Code
dplyr_set_attributes(1, list())
Condition
Error:
! Internal error: `attributes` must be named.

---

Code
dplyr_set_attributes(1, list(x = 1, 2))
Condition
Error:
! All attributes must have names. Attribute 2 does not.

22 changes: 22 additions & 0 deletions tests/testthat/helper-lazy.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
skip_if_no_lazy_character <- function() {
skip_if(getRversion() <= "3.5.0")

new_lazy_character <- import_vctrs("new_lazy_character", optional = TRUE)
lazy_character_is_materialized <- import_vctrs("lazy_character_is_materialized", optional = TRUE)

if (is.null(new_lazy_character) || is.null(lazy_character_is_materialized)) {
skip("Lazy character helpers from vctrs are not available.")
}

invisible()
}

new_lazy_character <- function(fn) {
f <- import_vctrs("new_lazy_character")
f(fn)
}

lazy_character_is_materialized <- function(x) {
f <- import_vctrs("lazy_character_is_materialized")
f(x)
}
91 changes: 91 additions & 0 deletions tests/testthat/test-utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -70,3 +70,94 @@ test_that("`list_flatten()` accepts a predicate `fn` to selectively flatten", {
list(1, list(b = 2), 3, c = 4, d = list(e = 5), 6)
)
})

# ------------------------------------------------------------------------------
# dplyr_attributes() / dplyr_set_attributes()

test_that("can get attributes", {
x <- list()
# Empty named list!
expect_identical(dplyr_attributes(x), set_names(list()))

x <- structure(list(), foo = 1, bar = 2)
expect_identical(dplyr_attributes(x), list(foo = 1, bar = 2))
})

test_that("can set attributes", {
x <- dplyr_set_attributes(1, list(foo = 1, bar = 2))
expect_identical(dplyr_attributes(x), list(foo = 1, bar = 2))
})

test_that("can't set attributes with `NULL`", {
expect_snapshot(error = TRUE, {
dplyr_set_attributes(1, NULL)
})
})

test_that("can't set attributes on `NULL`", {
expect_snapshot(error = TRUE, {
dplyr_set_attributes(NULL, list(x = 1))
})
})

test_that("can't set attributes with unnamed elements", {
expect_snapshot(error = TRUE, {
dplyr_set_attributes(1, list())
})
expect_snapshot(error = TRUE, {
dplyr_set_attributes(1, list(x = 1, 2))
})
})

test_that("object bit is set when setting attributes", {
x <- list()
attributes <- list(class = "foo")

expect_false(is.object(x))
expect_true(is.object(dplyr_set_attributes(x, attributes)))
})

test_that("`row.names` are not touched whatsoever when getting or setting (#6525)", {
skip_if_no_lazy_character()

df <- vctrs::data_frame(x = 1)

attributes <- attributes(df)
attributes$row.names <- new_lazy_character(function() "a")

# Start out unmaterialized
expect_false(lazy_character_is_materialized(attributes$row.names))

# Full roundtrip
df <- dplyr_set_attributes(df, attributes)
attributes <- dplyr_attributes(df)

# Still unmaterialized
expect_false(lazy_character_is_materialized(attributes$row.names))
})

test_that("`dplyr_reconstruct()`, which gets and sets attributes, doesn't touch `row.names` (#6525)", {
skip_if_no_lazy_character()

df <- vctrs::data_frame(x = 1)

attributes <- attributes(df)
attributes$row.names <- new_lazy_character(function() "a")

df_with_lazy_row_names <- dplyr_set_attributes(df, attributes)

# Ensure `data` row names aren't materialized
x <- dplyr_reconstruct(df_with_lazy_row_names, df)
attributes <- dplyr_attributes(df_with_lazy_row_names)
expect_false(lazy_character_is_materialized(attributes$row.names))

# `data` row names should also propagate into the result unmaterialized
attributes <- dplyr_attributes(x)
expect_false(lazy_character_is_materialized(attributes$row.names))

# Ensure `template` row names aren't materialized
x <- dplyr_reconstruct(df, df_with_lazy_row_names)
attributes <- dplyr_attributes(df_with_lazy_row_names)
expect_false(lazy_character_is_materialized(attributes$row.names))
})

0 comments on commit 48cc6d8

Please sign in to comment.