Skip to content

Commit

Permalink
Switch to pairlist based approach
Browse files Browse the repository at this point in the history
  • Loading branch information
DavisVaughan committed Nov 6, 2023
1 parent bd97acf commit ebc8bfe
Show file tree
Hide file tree
Showing 9 changed files with 195 additions and 265 deletions.
6 changes: 2 additions & 4 deletions R/generics.R
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,7 @@ dplyr_col_modify.rowwise_df <- function(data, cols) {
dplyr_reconstruct <- function(data, template) {
# Strip attributes before dispatch to make it easier to implement
# methods and prevent unexpected leaking of irrelevant attributes.
# This also enforces that `data` is a well-formed data frame.
data <- dplyr_new_data_frame(data)
return(dplyr_reconstruct_dispatch(data, template))
UseMethod("dplyr_reconstruct", template)
Expand All @@ -202,10 +203,7 @@ dplyr_reconstruct_dispatch <- function(data, template) {

#' @export
dplyr_reconstruct.data.frame <- function(data, template) {
attributes <- dplyr_attributes(template)
attributes$names <- names(data)
attributes$row.names <- .row_names_info(data, type = 0L)
dplyr_set_attributes(data, attributes)
.Call(ffi_dplyr_reconstruct, data, template)
}

#' @export
Expand Down
7 changes: 0 additions & 7 deletions R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -48,13 +48,6 @@ dplyr_new_tibble <- function(x, size) {
new_data_frame(x = x, n = size, class = c("tbl_df", "tbl"))
}

dplyr_attributes <- function(x) {
.Call(ffi_dplyr_attributes, x)
}
dplyr_set_attributes <- function(x, attributes) {
.Call(ffi_dplyr_set_attributes, x, attributes)
}

#' @param x A list
#' @param fn An optional function of 1 argument to be applied to each list
#' element of `x`. This allows you to further refine what elements should be
Expand Down
127 changes: 0 additions & 127 deletions src/attributes.cpp

This file was deleted.

5 changes: 3 additions & 2 deletions src/dplyr.h
Original file line number Diff line number Diff line change
Expand Up @@ -96,8 +96,9 @@ inline bool obj_is_list(SEXP x) {

}

SEXP ffi_dplyr_attributes(SEXP x);
SEXP ffi_dplyr_set_attributes(SEXP x, SEXP attributes);
SEXP ffi_dplyr_reconstruct(SEXP data, SEXP template_);
SEXP ffi_test_dplyr_attributes(SEXP x);
SEXP ffi_test_dplyr_set_attributes(SEXP x, SEXP attributes);

SEXP dplyr_expand_groups(SEXP old_groups, SEXP positions, SEXP s_nr);
SEXP dplyr_cumall(SEXP x);
Expand Down
5 changes: 3 additions & 2 deletions src/init.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -95,8 +95,9 @@ SEXP dplyr_init_library(SEXP ns_dplyr, SEXP ns_vctrs, SEXP ns_rlang) {
static const R_CallMethodDef CallEntries[] = {
{"dplyr_init_library", (DL_FUNC)& dplyr_init_library, 3},

{"ffi_dplyr_attributes", (DL_FUNC)& ffi_dplyr_attributes, 1},
{"ffi_dplyr_set_attributes", (DL_FUNC)& ffi_dplyr_set_attributes, 2},
{"ffi_dplyr_reconstruct", (DL_FUNC)& ffi_dplyr_reconstruct, 2},
{"ffi_test_dplyr_attributes", (DL_FUNC)& ffi_test_dplyr_attributes, 1},
{"ffi_test_dplyr_set_attributes", (DL_FUNC)& ffi_test_dplyr_set_attributes, 2},

{"dplyr_expand_groups", (DL_FUNC)& dplyr_expand_groups, 3},
{"dplyr_cumall", (DL_FUNC)& dplyr_cumall, 1},
Expand Down
136 changes: 136 additions & 0 deletions src/reconstruct.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
#include "dplyr.h"

// Essentially, a C implementation of:
//
// ```
// attributes <- attributes(template)
// attributes$names <- names(data)
// attributes$row.names <- .row_names_info(data, type = 0L)
// attributes(data) <- attributes
// ```
//
// The problem with that is that:
// - `attributes()` ends up calling `Rf_getAttrib()`, which tries to check
// for internal `row.names` in `template` so they aren't leaked to the user.
// Unfortunately this materializes lazy ALTREP `row.names`, like those used
// by duckplyr.
// - `attributes<-()` ends up calling `Rf_setAttrib()`, which tries to check
// if it can make efficient internal `row.names`. Again, this materializes
// lazy ALTREP `row.names`, like those used by duckplyr.
//
// So we bypass that here by carefully manipulating the attribute pairlists.
//
// We expect that at this point, both `data` and `template_` are S3 data
// frames, both of which have `names` and `row.names` attributes. If this isn't
// true, we error.
// - For `data`, we enforce this in `dplyr_reconstruct()`'s generic by calling
// `dplyr_new_data_frame()` (ideally no intermediate method invalidates this).
// - For `template_`, we assume this since we got here through the S3 method
// `dplyr_reconstruct.data.frame()`, which dispatched off `template_`. A
// well-formed S3 data frame must have `names` and `row.names` attributes.
//
// https://github.com/tidyverse/dplyr/pull/6947
// https://github.com/tidyverse/dplyr/issues/6525#issuecomment-1303619152
// https://github.com/wch/r-source/blob/69b94f0c8ce9b2497f6d7a81922575f6c585b713/src/main/attrib.c#L176-L177
// https://github.com/wch/r-source/blob/69b94f0c8ce9b2497f6d7a81922575f6c585b713/src/main/attrib.c#L57
SEXP ffi_dplyr_reconstruct(SEXP data, SEXP template_) {
if (TYPEOF(data) != VECSXP) {
Rf_errorcall(R_NilValue, "Internal error: `data` must be a list.");
}
if (TYPEOF(template_) != VECSXP) {
Rf_errorcall(R_NilValue, "Internal error: `template` must be a list.");
}
if (!OBJECT(data)) {
Rf_errorcall(R_NilValue, "Internal error: `data` must be an object.");
}
if (!OBJECT(template_)) {
Rf_errorcall(R_NilValue, "Internal error: `template` must be an object.");
}

bool seen_names = false;
bool seen_row_names = false;

// Pull the `names` and `row.names` off `data`.
// These are the only 2 attributes from `data` that persist.
SEXP names = R_NilValue;
SEXP row_names = R_NilValue;

for (SEXP node = ATTRIB(data); node != R_NilValue; node = CDR(node)) {
SEXP tag = TAG(node);

if (tag == R_NamesSymbol) {
names = CAR(node);
MARK_NOT_MUTABLE(names);
seen_names = true;
}
if (tag == R_RowNamesSymbol) {
row_names = CAR(node);
MARK_NOT_MUTABLE(row_names);
seen_row_names = true;
}
}

if (!seen_names) {
Rf_errorcall(R_NilValue, "Internal error: `data` must have a `names` attribute.");
}
if (!seen_row_names) {
Rf_errorcall(R_NilValue, "Internal error: `data` must have a `row.names` attribute.");
}

seen_names = false;
seen_row_names = false;

// Now replace the `names` and `row.names` attributes in the `template_`
// attributes with the ones from `data`. This attribute set becomes the final
// one we set on `data`.
SEXP attributes = ATTRIB(template_);
attributes = PROTECT(Rf_shallow_duplicate(attributes));

for (SEXP node = attributes; node != R_NilValue; node = CDR(node)) {
SEXP tag = TAG(node);

if (tag == R_NamesSymbol) {
SETCAR(node, names);
seen_names = true;
}
if (tag == R_RowNamesSymbol) {
SETCAR(node, row_names);
seen_row_names = true;
}
}

if (!seen_names) {
Rf_errorcall(R_NilValue, "Internal error: `template` must have a `names` attribute.");
}
if (!seen_row_names) {
Rf_errorcall(R_NilValue, "Internal error: `template` must have a `row.names` attribute.");
}

// Make an ALTREP wrapper if possible, since the underlying data doesn't change
#if R_VERSION >= R_Version(3, 6, 0)
data = PROTECT(R_shallow_duplicate_attr(data));
#else
data = PROTECT(Rf_shallow_duplicate(data));
#endif

SET_ATTRIB(data, attributes);

UNPROTECT(2);
return data;
}

// Very unsafe wrappers needed for testing.
// Bypass `Rf_getAttrib()` and `Rf_setAttrib()` calls to avoid forcing ALTREP
// `row.names`.
SEXP ffi_test_dplyr_attributes(SEXP x) {
return ATTRIB(x);
}
SEXP ffi_test_dplyr_set_attributes(SEXP x, SEXP attributes) {
if (TYPEOF(attributes) != LISTSXP) {
Rf_errorcall(R_NilValue, "`attributes` must be a pairlist.");
}
x = PROTECT(Rf_shallow_duplicate(x));
SET_ATTRIB(x, attributes);
UNPROTECT(1);
return x;
}
32 changes: 0 additions & 32 deletions tests/testthat/_snaps/utils.md

This file was deleted.

Loading

0 comments on commit ebc8bfe

Please sign in to comment.