diff --git a/NAMESPACE b/NAMESPACE index 8c8425f62..6ed41ba03 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -59,6 +59,7 @@ export(nafill) export(setnafill) export(.Last.updated) export(fcoalesce) +export(cbindlist, setcbindlist) export(substitute2) #export(DT) # mtcars |> DT(i,j,by) #4872 #5472 export(fctr) diff --git a/R/mergelist.R b/R/mergelist.R new file mode 100644 index 000000000..65c344aaa --- /dev/null +++ b/R/mergelist.R @@ -0,0 +1,12 @@ +cbindlist_impl_ = function(l, copy) { # shared worker behind cbindlist() and setcbindlist(); returns the column-bound result after setDT() + ans = .Call(Ccbindlist, l, copy) # 'l' and 'copy' are forwarded unchanged to the C routine + if (anyDuplicated(names(ans))) { ## invalidate key and index + setattr(ans, "sorted", NULL) + setattr(ans, "index", NULL) + } + setDT(ans) + ans +} + +cbindlist = function(l) cbindlist_impl_(l, copy=TRUE) # variant that calls the worker with copy=TRUE +setcbindlist = function(l) cbindlist_impl_(l, copy=FALSE) # variant that calls the worker with copy=FALSE diff --git a/inst/tests/mergelist.Rraw b/inst/tests/mergelist.Rraw new file mode 100644 index 000000000..8a7ddbd1d --- /dev/null +++ b/inst/tests/mergelist.Rraw @@ -0,0 +1,71 @@ +require(methods) + +if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { + if ((tt<-compiler::enableJIT(-1))>0) + cat("This is dev mode and JIT is enabled (level ", tt, ") so there will be a brief pause around the first test.\n", sep="") +} else { + require(data.table) + test = data.table:::test +} + +# cbindlist, setcbindlist + +local({ + l = list( + d1 = data.table(x=1:3, v1=1L), + d2 = data.table(y=3:1, v2=2L), + d3 = data.table(z=2:4, v3=3L) + ) + ans = cbindlist(l) + expected = data.table(l$d1, l$d2, l$d3) + test(11.01, ans, expected) + test(11.02, intersect(vapply(ans, address, ""), unlist(lapply(l, vapply, address, ""))), character()) + ans = setcbindlist(l) + expected = setDT(c(l$d1, l$d2, l$d3)) + test(11.03, ans, expected) + test(11.04, length(intersect(vapply(ans, address, ""), unlist(lapply(l, vapply, address, "")))), ncol(expected)) +}) + +test(11.05, cbindlist(list(data.table(a=1L), data.table(), data.table(d=2L), data.table(f=3L))), data.table(a=1L, d=2L, f=3L)) +## codecov +test(12.01, 
cbindlist(data.frame(a=1L)), error="must be a list") +test(12.02, cbindlist(TRUE), error="must be a list") +test(12.03, cbindlist(list(data.table(a=1L), 1L)), error="is not a data.table") +test(12.04, options = c(datatable.verbose=TRUE), cbindlist(list(data.table(a=1:2), data.table(b=1:2))), data.table(a=1:2, b=1:2), output="cbindlist.*took") +test(12.05, cbindlist(list(data.table(), data.table(a=1:2), data.table(b=1:2))), data.table(a=1:2, b=1:2)) +test(12.06, cbindlist(list(data.table(), data.table(a=1:2), list(b=1:2))), data.table(a=1:2, b=1:2)) +test(12.07, cbindlist(list(data.table(a=integer()), list(b=integer()))), data.table(a=integer(), b=integer())) +## duplicated names +test(12.08, cbindlist(list(data.table(a=1L, b=2L), data.table(b=3L, d=4L))), data.table(a=1L, b=2L, b=3L, d=4L)) +local({ + # also test that keys, indices are wiped + ans = cbindlist(list(setindexv(data.table(a=2:1, b=1:2), "a"), data.table(a=1:2, b=2:1, key="a"), data.table(a=2:1, b=1:2))) + test(12.09, ans, data.table(a=2:1, b=1:2, a=1:2, b=2:1, a=2:1, b=1:2)) + test(12.10, indices(ans), NULL) +}) +## recycling, first ensure cbind recycling that we want to match to +test(12.11, cbind(data.table(x=integer()), data.table(a=1:2)), data.table(x=c(NA_integer_, NA), a=1:2)) +test(12.12, cbind(data.table(x=1L), data.table(a=1:2)), data.table(x=c(1L, 1L), a=1:2)) +test(12.13, cbindlist(list(data.table(a=integer()), data.table(b=1:2))), error="Recycling.*not yet implemented") +test(12.14, cbindlist(list(data.table(a=1L), data.table(b=1:2))), error="Recycling.*not yet implemented") +test(12.15, setcbindlist(list(data.table(a=integer()), data.table(b=1:2))), error="have to have the same number of rows") +test(12.16, setcbindlist(list(data.table(a=1L), data.table(b=1:2))), error="have to have the same number of rows") + +## retain indices +local({ + l = list( + data.table(id1=1:5, id2=5:1, id3=1:5, v1=1:5), + data.table(id4=5:1, id5=1:5, v2=1:5), + data.table(id6=5:1, id7=1:5, v3=1:5), + 
data.table(id8=5:1, id9=5:1, v4=1:5) + ) + setkeyv(l[[1L]], "id1"); setindexv(l[[1L]], list("id1", "id2", "id3", c("id1","id2","id3"))); setindexv(l[[3L]], list("id6", "id7")); setindexv(l[[4L]], "id9") + ii = lapply(l, indices) + ans = cbindlist(l) + test(13.1, key(ans), "id1") + test(13.2, indices(ans), c("id1", "id2", "id3", "id1__id2__id3", "id6", "id7", "id9")) + test(13.3, ii, lapply(l, indices)) ## this tests that original indices have not been touched, shallow_duplicate in mergeIndexAttrib +}) +test(13.4, cbindlist(list(data.table(a=1:2), data.table(b=3:4, key="b"))), data.table(a=1:2, b=3:4, key="b")) +# TODO(#7116): this could be supported +# test(13.5, cbindlist(list(data.table(a=1:2, key="a"), data.table(b=3:4, key="b"))), data.table(a=1:2, b=3:4, key=c("a", "b"))) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 35ece3b4a..989fcb0ec 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -21291,3 +21291,15 @@ unlink(f) test(2325.2, fread('"foo","bar","baz"\n"a","b","c"', na.strings=c('"foo"', '"bar"', '"baz"'), header=FALSE), data.table(V1=c(NA, "a"), V2=c(NA, "b"), V3=c(NA, "c"))) + +## ensure setDT will retain key and indices when it is called on the list (cbindlist assumes this) +local({ + d = data.table(x=1:2, y=2:1, z=2:1, v1=1:2) + setkeyv(d, "x"); setindexv(d, list("y", "z")) + a = attributes(d) + attributes(d) = a[!names(a) %in% c("class", ".internal.selfref", "row.names")] + test(2326.1, class(d), "list") + setDT(d) + test(2326.2, key(d), "x") + test(2326.3, indices(d), c("y", "z")) +}) diff --git a/man/cbindlist.Rd b/man/cbindlist.Rd new file mode 100644 index 000000000..a26563ec1 --- /dev/null +++ b/man/cbindlist.Rd @@ -0,0 +1,41 @@ +\name{cbindlist} +\alias{cbindlist} +\alias{setcbindlist} +\alias{cbind} +\alias{cbind.data.table} +\title{Column bind multiple data.tables} +\description{ + Column bind multiple \code{data.table}s. 
+} +\usage{ + cbindlist(l) + setcbindlist(l) +} +\arguments{ + \item{l}{ \code{list} of \code{data.table}s to merge. } +} +\details{ + Column bind only stacks input elements. Works like \code{\link{data.table}}, but takes \code{list} type on input. Zero-column tables in \code{l} are omitted. Tables in \code{l} should have matching row count; recycling of length-1 rows is not yet implemented. Indices of the input tables are transferred to the resulting table, as well as the \emph{key} of the first keyed table. +} +\value{ + A new \code{data.table} based on the stacked objects. + + For \code{setcbindlist}, columns in the output will be shared with the input, i.e., \emph{no copy is made}. +} +\note{ + No attempt is made to deduplicate resulting names. If the result has any duplicate names, keys and indices are removed. +} +\seealso{ + \code{\link{data.table}}, \code{\link{rbindlist}}, \code{\link{setDT}} +} +\examples{ +d1 = data.table(x=1:3, v1=1L, key="x") +d2 = data.table(y=3:1, v2=2L, key="y") +d3 = data.table(z=2:4, v3=3L) +cbindlist(list(d1, d2, d3)) +cbindlist(list(d1, d1)) +d4 = setcbindlist(list(d1)) +d4[, v1:=2L] +identical(d4, d1) +} +\keyword{ data } diff --git a/src/data.table.h b/src/data.table.h index 56b3e34dc..e63fcd855 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -298,6 +298,9 @@ SEXP substitute_call_arg_namesR(SEXP expr, SEXP env); //negate.c SEXP notchin(SEXP x, SEXP table); +// mergelist.c +SEXP cbindlist(SEXP x, SEXP copyArg); + // functions called from R level .Call/.External and registered in init.c // these now live here to pass -Wstrict-prototypes, #5477 // all arguments must be SEXP since they are called from R level diff --git a/src/init.c b/src/init.c index faf07081c..13fb3e424 100644 --- a/src/init.c +++ b/src/init.c @@ -149,6 +149,7 @@ R_CallMethodDef callMethods[] = { {"CstartsWithAny", (DL_FUNC)&startsWithAny, -1}, {"CconvertDate", (DL_FUNC)&convertDate, -1}, {"Cnotchin", (DL_FUNC)&notchin, -1}, +{"Ccbindlist", (DL_FUNC) 
&cbindlist, -1}, {"Cwarn_matrix_column_r", (DL_FUNC)&warn_matrix_column_r, -1}, {NULL, NULL, 0} }; diff --git a/src/mergelist.c b/src/mergelist.c new file mode 100644 index 000000000..510da548c --- /dev/null +++ b/src/mergelist.c @@ -0,0 +1,78 @@ +#include "data.table.h" +/* Add the attributes of 'from' to 'to'. 'to' must be a zero-length integer vector (anything else is reported as an internal error); a NULL 'from' leaves 'to' untouched. If 'to' has no attributes it receives a shallow duplicate of the attribute list of 'from'; otherwise that duplicate is linked after the last attribute 'to' already carries. */ +void mergeIndexAttrib(SEXP to, SEXP from) { + if (!isInteger(to) || LENGTH(to)!=0) + internal_error(__func__, "'to' must be integer() already"); // # nocov + if (isNull(from)) + return; + SEXP t = ATTRIB(to), f = ATTRIB(from); /* t, f: attribute lists of the target and of the source */ + if (isNull(t)) // target has no attributes -> overwrite + SET_ATTRIB(to, shallow_duplicate(f)); + else { + for (t = ATTRIB(to); CDR(t) != R_NilValue; t = CDR(t)); // traverse to end of attributes list of to + SETCDR(t, shallow_duplicate(f)); + } +} + +SEXP cbindlist(SEXP x, SEXP copyArg) { + if (!isNewList(x) || isFrame(x)) + error(_("'%s' must be a list"), "x"); + bool copy = (bool)LOGICAL(copyArg)[0]; + const bool verbose = GetVerbose(); + double tic = 0; + if (verbose) + tic = omp_get_wtime(); + int nx = length(x), nans = 0, nr = -1, *nnx = (int*)R_alloc(nx, sizeof(int)); + bool recycle = false; + for (int i=0; i