diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION
index bbaf3e75da4e..66e2b5692190 100644
--- a/R-package/DESCRIPTION
+++ b/R-package/DESCRIPTION
@@ -65,6 +65,6 @@ Imports:
data.table (>= 1.9.6),
jsonlite (>= 1.0)
Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.3.0
+RoxygenNote: 7.3.1
Encoding: UTF-8
SystemRequirements: GNU make, C++17
diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE
index 398b0da5a056..49f93bb57274 100644
--- a/R-package/NAMESPACE
+++ b/R-package/NAMESPACE
@@ -34,6 +34,11 @@ export(slice)
export(xgb.DMatrix)
export(xgb.DMatrix.hasinfo)
export(xgb.DMatrix.save)
+export(xgb.DataIter)
+export(xgb.ExternalDMatrix)
+export(xgb.ProxyDMatrix)
+export(xgb.QuantileDMatrix)
+export(xgb.QuantileDMatrix.from_iterator)
export(xgb.attr)
export(xgb.attributes)
export(xgb.config)
diff --git a/R-package/R/xgb.DMatrix.R b/R-package/R/xgb.DMatrix.R
index 7c4c30bd3035..da036b952b83 100644
--- a/R-package/R/xgb.DMatrix.R
+++ b/R-package/R/xgb.DMatrix.R
@@ -1,13 +1,42 @@
#' Construct xgb.DMatrix object
#'
-#' Construct xgb.DMatrix object from either a dense matrix, a sparse matrix, or a local file.
-#' Supported input file formats are either a LIBSVM text file or a binary file that was created previously by
-#' \code{\link{xgb.DMatrix.save}}).
-#'
-#' @param data a \code{matrix} object (either numeric or integer), a \code{dgCMatrix} object,
-#' a \code{dgRMatrix} object,
-#' a \code{dsparseVector} object (only when making predictions from a fitted model, will be
-#' interpreted as a row vector), or a character string representing a filename.
+#' Construct an 'xgb.DMatrix' object from a given data source, which can then be passed to functions
+#' such as \link{xgb.train} or \link{predict.xgb.Booster}.
+#'
+#' Function 'xgb.QuantileDMatrix' will construct a DMatrix with quantization for the histogram
+#' method already applied to it, which can be used to reduce memory usage (compared to using a
+#' regular DMatrix first and then creating a quantization out of it) when using the histogram
+#' method (`tree_method = "hist"`, which is the default algorithm), but is not usable for the
+#' sorted-indices method (`tree_method = "exact"`), nor for the approximate method
+#' (`tree_method = "approx"`).
+#' @param data Data from which to create a DMatrix, which can then be used for fitting models or
+#' for getting predictions out of a fitted model.
+#'
+#' Supported input types are as follows:\itemize{
+#' \item `matrix` objects, with types `numeric`, `integer`, or `logical`.
+#' \item `data.frame` objects, with columns of types `numeric`, `integer`, `logical`, or `factor`.
+#'
+#' If passing `enable_categorical=TRUE`, columns with `factor` type will be treated as categorical.
+#' Otherwise, if passing `enable_categorical=FALSE` and the data contains `factor` columns, an error
+#' will be thrown.
+#'
+#' Note that xgboost uses base-0 encoding for categorical types, hence `factor` types (which use base-1
+#' encoding) will be converted inside the function call. Be aware that the encoding used for `factor`
+#' types is not kept as part of the model, so in subsequent calls to `predict`, it is the user's
+#' responsibility to ensure that factor columns have the same levels as the ones from which the DMatrix
+#' was constructed.
+#'
+#' Other column types are not supported.
+#' \item CSR matrices, as class `dgRMatrix` from package `Matrix`.
+#' \item CSC matrices, as class `dgCMatrix` from package `Matrix`. These are \bold{not} supported for
+#' 'xgb.QuantileDMatrix'.
+#' \item Single-row CSR matrices, as class `dsparseVector` from package `Matrix`, which is interpreted
+#' as a single row (only when making predictions from a fitted model).
+#' \item Text files in SVMLight / LibSVM formats, passed as a path to the file. These are \bold{not}
+#' supported for 'xgb.QuantileDMatrix'.
+#' \item Binary files generated by \link{xgb.DMatrix.save}, passed as a path to the file. These are
+#' \bold{not} supported for 'xgb.QuantileDMatrix'.
+#' }
#' @param label Label of the training data.
#' @param weight Weight for each instance.
#'
@@ -18,11 +47,32 @@
#' @param base_margin Base margin used for boosting from existing model.
#'
#' In the case of multi-output models, one can also pass multi-dimensional base_margin.
-#' @param missing a float value to represents missing values in data (used only when input is a dense matrix).
-#' It is useful when a 0 or some other extreme value represents missing values in data.
+#' @param missing A float value to represent missing values in data (not used when creating DMatrix
+#' from text files).
+#' It is useful to change when a zero, infinite, or some other extreme value represents missing
+#' values in data.
#' @param silent whether to suppress printing an informational message after loading from a file.
#' @param feature_names Set names for features. Overrides column names in data
#' frame and matrix.
+#'
+#' Note: columns are not referenced by name when calling `predict`, so the column order there
+#' must be the same as in the DMatrix construction, regardless of the column names.
+#' @param feature_types Set types for features.
+#'
+#' If `data` is a `data.frame` and passing `enable_categorical=TRUE`, the types will be deduced
+#' automatically from the column types.
+#'
+#' Otherwise, one can pass a character vector with the same length as number of columns in `data`,
+#' with the following possible values:\itemize{
+#' \item "c", which represents categorical columns.
+#' \item "q", which represents numeric columns.
+#' \item "int", which represents integer columns.
+#' \item "i", which represents logical (boolean) columns.
+#' }
+#'
+#' Note that, while categorical types are treated differently from the rest for model fitting
+#' purposes, the other types do not influence the generated model, but have effects in other
+#' functionalities such as feature importances.
#' @param nthread Number of threads used for creating DMatrix.
#' @param group Group size for all ranking group.
#' @param qid Query ID for data samples, used for ranking.
@@ -41,6 +91,8 @@
#' If 'data' is not a data frame, this argument is ignored.
#'
#' JSON/UBJSON serialization format is required for this.
+#' @return An 'xgb.DMatrix' object. If calling 'xgb.QuantileDMatrix', it will have additional
+#' subclass 'xgb.QuantileDMatrix'.
#'
#' @details
#' Note that DMatrix objects are not serializable through R functions such as \code{saveRDS} or \code{save}.
@@ -60,6 +112,7 @@
#' xgb.DMatrix.save(dtrain, fname)
#' dtrain <- xgb.DMatrix(fname)
#' @export
+#' @rdname xgb.DMatrix
xgb.DMatrix <- function(
data,
label = NULL,
@@ -68,6 +121,7 @@ xgb.DMatrix <- function(
missing = NA,
silent = FALSE,
feature_names = colnames(data),
+ feature_types = NULL,
nthread = NULL,
group = NULL,
qid = NULL,
@@ -79,7 +133,7 @@ xgb.DMatrix <- function(
if (!is.null(group) && !is.null(qid)) {
stop("Either one of 'group' or 'qid' should be NULL")
}
- ctypes <- NULL
+ nthread <- as.integer(NVL(nthread, -1L))
if (typeof(data) == "character") {
if (length(data) > 1) {
stop(
@@ -91,7 +145,7 @@ xgb.DMatrix <- function(
handle <- .Call(XGDMatrixCreateFromFile_R, data, as.integer(silent))
} else if (is.matrix(data)) {
handle <- .Call(
- XGDMatrixCreateFromMat_R, data, missing, as.integer(NVL(nthread, -1))
+ XGDMatrixCreateFromMat_R, data, missing, nthread
)
} else if (inherits(data, "dgCMatrix")) {
handle <- .Call(
@@ -101,7 +155,7 @@ xgb.DMatrix <- function(
data@x,
nrow(data),
missing,
- as.integer(NVL(nthread, -1))
+ nthread
)
} else if (inherits(data, "dgRMatrix")) {
handle <- .Call(
@@ -111,7 +165,7 @@ xgb.DMatrix <- function(
data@x,
ncol(data),
missing,
- as.integer(NVL(nthread, -1))
+ nthread
)
} else if (inherits(data, "dsparseVector")) {
indptr <- c(0L, as.integer(length(data@i)))
@@ -123,51 +177,99 @@ xgb.DMatrix <- function(
data@x,
length(data),
missing,
- as.integer(NVL(nthread, -1))
+ nthread
)
} else if (is.data.frame(data)) {
- ctypes <- sapply(data, function(x) {
- if (is.factor(x)) {
+ tmp <- .process.df.for.dmatrix(data, enable_categorical, feature_types)
+ feature_types <- tmp$feature_types
+ handle <- .Call(
+ XGDMatrixCreateFromDF_R, tmp$lst, missing, nthread
+ )
+ rm(tmp)
+ } else {
+ stop("xgb.DMatrix does not support construction from ", typeof(data))
+ }
+
+ dmat <- handle
+ attributes(dmat) <- list(
+ class = "xgb.DMatrix",
+ fields = new.env()
+ )
+ .set.dmatrix.fields(
+ dmat = dmat,
+ label = label,
+ weight = weight,
+ base_margin = base_margin,
+ feature_names = feature_names,
+ feature_types = feature_types,
+ group = group,
+ qid = qid,
+ label_lower_bound = label_lower_bound,
+ label_upper_bound = label_upper_bound,
+ feature_weights = feature_weights
+ )
+
+ return(dmat)
+}
+
+.process.df.for.dmatrix <- function(df, enable_categorical, feature_types) {
+ if (!nrow(df) || !ncol(df)) {
+ stop("'data' is an empty data.frame.")
+ }
+ if (!is.null(feature_types)) {
+ if (!is.character(feature_types) || length(feature_types) != ncol(df)) {
+ stop(
+ "'feature_types' must be a character vector with one entry per column in 'data'."
+ )
+ }
+ } else {
+ feature_types <- sapply(df, function(col) {
+ if (is.factor(col)) {
if (!enable_categorical) {
stop(
"When factor type is used, the parameter `enable_categorical`",
" must be set to TRUE."
)
}
- "c"
- } else if (is.integer(x)) {
- "int"
- } else if (is.logical(x)) {
- "i"
+ return("c")
+ } else if (is.integer(col)) {
+ return("int")
+ } else if (is.logical(col)) {
+ return("i")
} else {
- if (!is.numeric(x)) {
+ if (!is.numeric(col)) {
stop("Invalid type in dataframe.")
}
- "float"
+ return("float")
}
})
- ## as.data.frame somehow converts integer/logical into real.
- data <- as.data.frame(sapply(data, function(x) {
- if (is.factor(x)) {
- ## XGBoost uses 0-based indexing.
- as.numeric(x) - 1
- } else {
- x
- }
- }))
- handle <- .Call(
- XGDMatrixCreateFromDF_R, data, missing, as.integer(NVL(nthread, -1))
- )
- } else {
- stop("xgb.DMatrix does not support construction from ", typeof(data))
}
- dmat <- handle
- attributes(dmat) <- list(
- class = "xgb.DMatrix",
- fields = new.env()
- )
+ lst <- lapply(df, function(col) {
+ is_factor <- is.factor(col)
+ col <- as.numeric(col)
+ if (is_factor) {
+ col <- col - 1
+ }
+ return(col)
+ })
+
+ return(list(lst = lst, feature_types = feature_types))
+}
+.set.dmatrix.fields <- function(
+ dmat,
+ label,
+ weight,
+ base_margin,
+ feature_names,
+ feature_types,
+ group,
+ qid,
+ label_lower_bound,
+ label_upper_bound,
+ feature_weights
+) {
if (!is.null(label)) {
setinfo(dmat, "label", label)
}
@@ -180,6 +282,9 @@ xgb.DMatrix <- function(
if (!is.null(feature_names)) {
setinfo(dmat, "feature_name", feature_names)
}
+ if (!is.null(feature_types)) {
+ setinfo(dmat, "feature_type", feature_types)
+ }
if (!is.null(group)) {
setinfo(dmat, "group", group)
}
@@ -195,10 +300,515 @@ xgb.DMatrix <- function(
if (!is.null(feature_weights)) {
setinfo(dmat, "feature_weights", feature_weights)
}
- if (!is.null(ctypes)) {
- setinfo(dmat, "feature_type", ctypes)
+}
+
+#' @param ref The training dataset that provides quantile information, needed when creating
+#' validation/test dataset with `xgb.QuantileDMatrix`. Supplying the training DMatrix
+#' as a reference means that the same quantisation applied to the training data is
+#' applied to the validation/test data.
+#' @param max_bin The number of histogram bins; should be consistent with the training parameter
+#' `max_bin`.
+#'
+#' This is only supported when constructing a QuantileDMatrix.
+#' @export
+#' @rdname xgb.DMatrix
+xgb.QuantileDMatrix <- function(
+ data,
+ label = NULL,
+ weight = NULL,
+ base_margin = NULL,
+ missing = NA,
+ feature_names = colnames(data),
+ feature_types = NULL,
+ nthread = NULL,
+ group = NULL,
+ qid = NULL,
+ label_lower_bound = NULL,
+ label_upper_bound = NULL,
+ feature_weights = NULL,
+ enable_categorical = FALSE,
+ ref = NULL,
+ max_bin = NULL
+) {
+ nthread <- as.integer(NVL(nthread, -1L))
+ if (!is.null(ref) && !inherits(ref, "xgb.DMatrix")) {
+ stop("'ref' must be an xgb.DMatrix object.")
+ }
+
+ # Note: when passing an integer matrix, it won't get casted to numeric.
+ # Since 'int' values as understood by languages like C cannot have missing values,
+ # R represents missingness there by assigning them a value equal to the minimum
+ # integer. The 'missing' value here is set before the data, so in case of integers,
+ # need to make the conversion manually beforehand.
+ if (is.matrix(data) && storage.mode(data) %in% c("integer", "logical") && is.na(missing)) {
+ missing <- .Call(XGGetRNAIntAsDouble)
+ }
+
+ iterator_env <- as.environment(
+ list(
+ data = data,
+ label = label,
+ weight = weight,
+ base_margin = base_margin,
+ missing = missing,
+ feature_names = feature_names,
+ feature_types = feature_types,
+ group = group,
+ qid = qid,
+ label_lower_bound = label_lower_bound,
+ label_upper_bound = label_upper_bound,
+ feature_weights = feature_weights,
+ enable_categorical = enable_categorical
+ )
+ )
+ data_iterator <- .single.data.iterator(iterator_env)
+
+ # Note: the ProxyDMatrix has its finalizer assigned in the R externalptr
+ # object, but that finalizer will only be called once the object is
+ # garbage-collected, which doesn't happen immediately after it goes out
+ # of scope, hence this piece of code to trigger its destruction earlier
+ # and free memory right away.
+ proxy_handle <- .make.proxy.handle()
+ on.exit({
+ .Call(XGDMatrixFree_R, proxy_handle)
+ })
+ iterator_next <- function() {
+ return(xgb.ProxyDMatrix.internal(proxy_handle, data_iterator))
+ }
+ iterator_reset <- function() {
+ return(data_iterator$f_reset(iterator_env))
+ }
+ calling_env <- environment()
+
+ dmat <- .Call(
+ XGQuantileDMatrixCreateFromCallback_R,
+ iterator_next,
+ iterator_reset,
+ calling_env,
+ proxy_handle,
+ nthread,
+ missing,
+ max_bin,
+ ref
+ )
+ attributes(dmat) <- list(
+ class = c("xgb.DMatrix", "xgb.QuantileDMatrix"),
+ fields = attributes(proxy_handle)$fields
+ )
+ return(dmat)
+}
+
+#' @title XGBoost Data Iterator
+#' @description Interface to create a custom data iterator in order to construct a DMatrix
+#' from external memory.
+#'
+#' This function is responsible for generating an R object structure containing callback
+#' functions and an environment shared with them.
+#'
+#' The output structure from this function is then meant to be passed to \link{xgb.ExternalDMatrix},
+#' which will consume the data and create a DMatrix from it by executing the callback functions.
+#'
+#' For more information, and for a usage example, see the documentation for \link{xgb.ExternalDMatrix}.
+#' @param env An R environment to pass to the callback functions supplied here, which can be
+#' used to keep track of variables to determine how to handle the batches.
+#'
+#' For example, one might want to keep track of an iteration number in this environment in order
+#' to know which part of the data to pass next.
+#' @param f_next `function(env)` which is responsible for:\itemize{
+#' \item Accessing or retrieving the next batch of data in the iterator.
+#' \item Supplying this data by calling function \link{xgb.ProxyDMatrix} on it and returning the result.
+#' \item Keeping track of where in the iterator batch it is or will go next, which can for example
+#' be done by modifying variables in the `env` variable that is passed here.
+#' \item Signaling whether there are more batches to be consumed or not, by returning `NULL`
+#' when the stream of data ends (all batches in the iterator have been consumed), or the result from
+#' calling \link{xgb.ProxyDMatrix} when there are more batches in the line to be consumed.
+#' }
+#' @param f_reset `function(env)` which is responsible for resetting the data iterator
+#' (i.e. taking it back to the first batch, called before and after the sequence of batches
+#' has been consumed).
+#'
+#' Note that, after resetting the iterator, the batches will be accessed again, so the same data
+#' (and in the same order) must be passed in subsequent iterations.
+#' @return An `xgb.DataIter` object, containing the same inputs supplied here, which can then
+#' be passed to \link{xgb.ExternalDMatrix}.
+#' @seealso \link{xgb.ExternalDMatrix}, \link{xgb.ProxyDMatrix}.
+#' @export
+xgb.DataIter <- function(env = new.env(), f_next, f_reset) {
+ if (!is.function(f_next)) {
+ stop("'f_next' must be a function.")
+ }
+ if (!is.function(f_reset)) {
+ stop("'f_reset' must be a function.")
+ }
+ out <- list(
+ env = env,
+ f_next = f_next,
+ f_reset = f_reset
+ )
+ class(out) <- "xgb.DataIter"
+ return(out)
+}
+
+.qdm.single.fnext <- function(env) {
+ curr_iter <- env[["iter"]]
+ if (curr_iter >= 1L) {
+ return(NULL)
+ }
+
+ on.exit({
+ env[["iter"]] <- curr_iter + 1L
+ })
+ return(
+ xgb.ProxyDMatrix(
+ data = env[["data"]],
+ label = env[["label"]],
+ weight = env[["weight"]],
+ base_margin = env[["base_margin"]],
+ feature_names = env[["feature_names"]],
+ feature_types = env[["feature_types"]],
+ group = env[["group"]],
+ qid = env[["qid"]],
+ label_lower_bound = env[["label_lower_bound"]],
+ label_upper_bound = env[["label_upper_bound"]],
+ feature_weights = env[["feature_weights"]],
+ enable_categorical = env[["enable_categorical"]]
+ )
+ )
+}
+
+.qdm.single.freset <- function(env) {
+ env[["iter"]] <- 0L
+ return(invisible(NULL))
+}
+
+.single.data.iterator <- function(env) {
+ env[["iter"]] <- 0L
+ return(xgb.DataIter(env, .qdm.single.fnext, .qdm.single.freset))
+}
+
+# Only for internal usage
+.make.proxy.handle <- function() {
+ out <- .Call(XGProxyDMatrixCreate_R)
+ attributes(out) <- list(
+ class = c("xgb.DMatrix", "xgb.ProxyDMatrixHandle"),
+ fields = new.env()
+ )
+ return(out)
+}
+
+#' @title Proxy DMatrix Updater
+#' @description Helper function to supply data in batches of a data iterator when
+#' constructing a DMatrix from external memory through \link{xgb.ExternalDMatrix}
+#' or through \link{xgb.QuantileDMatrix.from_iterator}.
+#'
+#' This function is \bold{only} meant to be called inside of a callback function (which
+#' is passed as argument to function \link{xgb.DataIter} to construct a data iterator)
+#' when constructing a DMatrix through external memory - otherwise, one should call
+#' \link{xgb.DMatrix} or \link{xgb.QuantileDMatrix}.
+#'
+#' The object that results from calling this function directly is \bold{not} like the other
+#' `xgb.DMatrix` variants - i.e. cannot be used to train a model, nor to get predictions - only
+#' possible usage is to supply data to an iterator, from which a DMatrix is then constructed.
+#'
+#' For more information and for example usage, see the documentation for \link{xgb.ExternalDMatrix}.
+#' @inheritParams xgb.DMatrix
+#' @param data The data for the current batch.
+#'
+#' Note that not all of the input types supported by \link{xgb.DMatrix} are possible
+#' to pass here. Supported types are:\itemize{
+#' \item `matrix`, with types `numeric`, `integer`, and `logical`. Note that for types
+#' `integer` and `logical`, missing values might not be automatically recognized as
+#' such - see the documentation for parameter `missing` in \link{xgb.ExternalDMatrix}
+#' for details on this.
+#' \item `data.frame`, with the same types as supported by 'xgb.DMatrix' and same
+#' conversions applied to it. See the documentation for parameter `data` in
+#' \link{xgb.DMatrix} for details on it.
+#' \item CSR matrices, as class `dgRMatrix` from package `Matrix`.
+#' }
+#' @return An object of class `xgb.ProxyDMatrix`, which is just a list containing the
+#' data and parameters passed here. It does \bold{not} inherit from `xgb.DMatrix`.
+#' @seealso \link{xgb.DataIter}, \link{xgb.ExternalDMatrix}.
+#' @export
+xgb.ProxyDMatrix <- function(
+ data,
+ label = NULL,
+ weight = NULL,
+ base_margin = NULL,
+ feature_names = colnames(data),
+ feature_types = NULL,
+ group = NULL,
+ qid = NULL,
+ label_lower_bound = NULL,
+ label_upper_bound = NULL,
+ feature_weights = NULL,
+ enable_categorical = FALSE
+) {
+ stopifnot(inherits(data, c("matrix", "data.frame", "dgRMatrix")))
+ out <- list(
+ data = data,
+ label = label,
+ weight = weight,
+ base_margin = base_margin,
+ feature_names = feature_names,
+ feature_types = feature_types,
+ group = group,
+ qid = qid,
+ label_lower_bound = label_lower_bound,
+ label_upper_bound = label_upper_bound,
+ feature_weights = feature_weights,
+ enable_categorical = enable_categorical
+ )
+ class(out) <- "xgb.ProxyDMatrix"
+ return(out)
+}
+
+xgb.ProxyDMatrix.internal <- function(proxy_handle, data_iterator) {
+ lst <- data_iterator$f_next(data_iterator$env)
+ if (is.null(lst)) {
+ return(0L)
+ }
+ if (!inherits(lst, "xgb.ProxyDMatrix")) {
+ stop("DataIter 'f_next' must return either NULL or the result from calling 'xgb.ProxyDMatrix'.")
+ }
+
+ if (!is.null(lst$group) && !is.null(lst$qid)) {
+ stop("Either one of 'group' or 'qid' should be NULL")
+ }
+ if (is.data.frame(lst$data)) {
+ tmp <- .process.df.for.dmatrix(lst$data, lst$enable_categorical, lst$feature_types)
+ lst$feature_types <- tmp$feature_types
+ .Call(XGProxyDMatrixSetDataColumnar_R, proxy_handle, tmp$lst)
+ rm(tmp)
+ } else if (is.matrix(lst$data)) {
+ .Call(XGProxyDMatrixSetDataDense_R, proxy_handle, lst$data)
+ } else if (inherits(lst$data, "dgRMatrix")) {
+ tmp <- list(p = lst$data@p, j = lst$data@j, x = lst$data@x, ncol = ncol(lst$data))
+ .Call(XGProxyDMatrixSetDataCSR_R, proxy_handle, tmp)
+ } else {
+ stop("'data' has unsupported type.")
+ }
+
+ .set.dmatrix.fields(
+ dmat = proxy_handle,
+ label = lst$label,
+ weight = lst$weight,
+ base_margin = lst$base_margin,
+ feature_names = lst$feature_names,
+ feature_types = lst$feature_types,
+ group = lst$group,
+ qid = lst$qid,
+ label_lower_bound = lst$label_lower_bound,
+ label_upper_bound = lst$label_upper_bound,
+ feature_weights = lst$feature_weights
+ )
+
+ return(1L)
+}
+
+#' @title DMatrix from External Data
+#' @description Create a special type of xgboost 'DMatrix' object from external data
+#' supplied by an \link{xgb.DataIter} object, potentially passed in batches from a
+#' bigger set that might not fit entirely in memory.
+#'
+#' The data supplied by the iterator is accessed on-demand as needed, multiple times,
+#' without being concatenated, but note that fields like 'label' \bold{will} be
+#' concatenated from multiple calls to the data iterator.
+#'
+#' For more information, see the guide 'Using XGBoost External Memory Version':
+#' \url{https://xgboost.readthedocs.io/en/stable/tutorials/external_memory.html}
+#' @inheritParams xgb.DMatrix
+#' @param data_iterator A data iterator structure as returned by \link{xgb.DataIter},
+#' which includes an environment shared between function calls, and functions to access
+#' the data in batches on-demand.
+#' @param cache_prefix The path of cache file, caller must initialize all the directories in this path.
+#' @param missing A float value to represent missing values in data.
+#'
+#' Note that, while functions like \link{xgb.DMatrix} can take a generic `NA` and interpret it
+#' correctly for different types like `numeric` and `integer`, if an `NA` value is passed here,
+#' it will not be adapted for different input types.
+#'
+#' For example, in R `integer` types, missing values are represented by integer number `-2147483648`
+#' (since machine 'integer' types do not have an inherent 'NA' value) - hence, if one passes `NA`,
+#' which is interpreted as a floating-point NaN by 'xgb.ExternalDMatrix' and by
+#' 'xgb.QuantileDMatrix.from_iterator', these integer missing values will not be treated as missing.
+#' This should not pose any problem for `numeric` types, since they do have an inherent NaN value.
+#' @return An 'xgb.DMatrix' object, with subclass 'xgb.ExternalDMatrix', in which the data is not
+#' held internally but accessed through the iterator when needed.
+#' @seealso \link{xgb.DataIter}, \link{xgb.ProxyDMatrix}, \link{xgb.QuantileDMatrix.from_iterator}
+#' @examples
+#' library(xgboost)
+#' data(mtcars)
+#'
+#' # this custom environment will be passed to the iterator
+#' # functions at each call. It's up to the user to keep
+#' # track of the iteration number in this environment.
+#' iterator_env <- as.environment(
+#' list(
+#' iter = 0,
+#' x = mtcars[, -1],
+#' y = mtcars[, 1]
+#' )
+#' )
+#'
+#' # Data is passed in two batches.
+#' # In this example, batches are obtained by subsetting the 'x' variable.
+#' # This is not advantageous to do, since the data is already loaded in memory
+#' # and can be passed in full in one go, but there can be situations in which
+#' # only a subset of the data will fit in the computer's memory, and it can
+#' # be loaded in batches that are accessed one-at-a-time only.
+#' iterator_next <- function(iterator_env) {
+#' curr_iter <- iterator_env[["iter"]]
+#' if (curr_iter >= 2) {
+#' # there are only two batches, so this signals end of the stream
+#' return(NULL)
+#' }
+#'
+#' if (curr_iter == 0) {
+#' x_batch <- iterator_env[["x"]][1:16, ]
+#' y_batch <- iterator_env[["y"]][1:16]
+#' } else {
+#' x_batch <- iterator_env[["x"]][17:32, ]
+#' y_batch <- iterator_env[["y"]][17:32]
+#' }
+#' on.exit({
+#' iterator_env[["iter"]] <- curr_iter + 1
+#' })
+#'
+#' # Function 'xgb.ProxyDMatrix' must be called manually
+#' # at each batch with all the appropriate attributes,
+#' # such as feature names and feature types.
+#' return(xgb.ProxyDMatrix(data = x_batch, label = y_batch))
+#' }
+#'
+#' # This moves the iterator back to its beginning
+#' iterator_reset <- function(iterator_env) {
+#' iterator_env[["iter"]] <- 0
+#' }
+#'
+#' data_iterator <- xgb.DataIter(
+#' env = iterator_env,
+#' f_next = iterator_next,
+#' f_reset = iterator_reset
+#' )
+#' cache_prefix <- tempdir()
+#'
+#' # DMatrix will be constructed from the iterator's batches
+#' dm <- xgb.ExternalDMatrix(data_iterator, cache_prefix, nthread = 1)
+#'
+#' # After construction, can be used as a regular DMatrix
+#' params <- list(nthread = 1, objective = "reg:squarederror")
+#' model <- xgb.train(data = dm, nrounds = 2, params = params)
+#'
+#' # Predictions can also be called on it, and should be the same
+#' # as if the data were passed differently.
+#' pred_dm <- predict(model, dm)
+#' pred_mat <- predict(model, as.matrix(mtcars[, -1]))
+#' @export
+xgb.ExternalDMatrix <- function(
+ data_iterator,
+ cache_prefix = tempdir(),
+ missing = NA,
+ nthread = NULL
+) {
+ stopifnot(inherits(data_iterator, "xgb.DataIter"))
+ stopifnot(is.character(cache_prefix))
+
+ cache_prefix <- path.expand(cache_prefix)
+ nthread <- as.integer(NVL(nthread, -1L))
+
+ proxy_handle <- .make.proxy.handle()
+ on.exit({
+ .Call(XGDMatrixFree_R, proxy_handle)
+ })
+ iterator_next <- function() {
+ return(xgb.ProxyDMatrix.internal(proxy_handle, data_iterator))
+ }
+ iterator_reset <- function() {
+ return(data_iterator$f_reset(data_iterator$env))
}
+ calling_env <- environment()
+ dmat <- .Call(
+ XGDMatrixCreateFromCallback_R,
+ iterator_next,
+ iterator_reset,
+ calling_env,
+ proxy_handle,
+ nthread,
+ missing,
+ cache_prefix
+ )
+
+ attributes(dmat) <- list(
+ class = c("xgb.DMatrix", "xgb.ExternalDMatrix"),
+ fields = attributes(proxy_handle)$fields
+ )
+ return(dmat)
+}
+
+
+#' @title QuantileDMatrix from External Data
+#' @description Create an `xgb.QuantileDMatrix` object (exact same class as would be returned by
+#' calling function \link{xgb.QuantileDMatrix}, with the same advantages and limitations) from
+#' external data supplied by an \link{xgb.DataIter} object, potentially passed in batches from
+#' a bigger set that might not fit entirely in memory, same way as \link{xgb.ExternalDMatrix}.
+#'
+#' Note that, while external data will only be loaded through the iterator (thus the full data
+#' might not be held entirely in-memory), the quantized representation of the data will get
+#' created in-memory, being concatenated from multiple calls to the data iterator. The quantized
+#' version is typically lighter than the original data, so there might be cases in which this
+#' representation could potentially fit in memory even if the full data doesn't.
+#'
+#' For more information, see the guide 'Using XGBoost External Memory Version':
+#' \url{https://xgboost.readthedocs.io/en/stable/tutorials/external_memory.html}
+#' @inheritParams xgb.ExternalDMatrix
+#' @inheritParams xgb.QuantileDMatrix
+#' @return An 'xgb.DMatrix' object, with subclass 'xgb.QuantileDMatrix'.
+#' @seealso \link{xgb.DataIter}, \link{xgb.ProxyDMatrix}, \link{xgb.ExternalDMatrix},
+#' \link{xgb.QuantileDMatrix}
+#' @export
+xgb.QuantileDMatrix.from_iterator <- function( # nolint
+ data_iterator,
+ missing = NA,
+ nthread = NULL,
+ ref = NULL,
+ max_bin = NULL
+) {
+ stopifnot(inherits(data_iterator, "xgb.DataIter"))
+ if (!is.null(ref) && !inherits(ref, "xgb.DMatrix")) {
+ stop("'ref' must be an xgb.DMatrix object.")
+ }
+
+ nthread <- as.integer(NVL(nthread, -1L))
+
+ proxy_handle <- .make.proxy.handle()
+ on.exit({
+ .Call(XGDMatrixFree_R, proxy_handle)
+ })
+ iterator_next <- function() {
+ return(xgb.ProxyDMatrix.internal(proxy_handle, data_iterator))
+ }
+ iterator_reset <- function() {
+ return(data_iterator$f_reset(data_iterator$env))
+ }
+ calling_env <- environment()
+
+ dmat <- .Call(
+ XGQuantileDMatrixCreateFromCallback_R,
+ iterator_next,
+ iterator_reset,
+ calling_env,
+ proxy_handle,
+ nthread,
+ missing,
+ max_bin,
+ ref
+ )
+
+ attributes(dmat) <- list(
+ class = c("xgb.DMatrix", "xgb.QuantileDMatrix"),
+ fields = attributes(proxy_handle)$fields
+ )
return(dmat)
}
@@ -712,7 +1322,17 @@ print.xgb.DMatrix <- function(x, verbose = FALSE, ...) {
cat("INVALID xgb.DMatrix object. Must be constructed anew.\n")
return(invisible(x))
}
- cat('xgb.DMatrix dim:', nrow(x), 'x', ncol(x), ' info: ')
+ class_print <- if (inherits(x, "xgb.QuantileDMatrix")) {
+ "xgb.QuantileDMatrix"
+ } else if (inherits(x, "xgb.ExternalDMatrix")) {
+ "xgb.ExternalDMatrix"
+ } else if (inherits(x, "xgb.ProxyDMatrix")) {
+ "xgb.ProxyDMatrix"
+ } else {
+ "xgb.DMatrix"
+ }
+
+ cat(class_print, ' dim:', nrow(x), 'x', ncol(x), ' info: ')
infos <- character(0)
if (xgb.DMatrix.hasinfo(x, 'label')) infos <- 'label'
if (xgb.DMatrix.hasinfo(x, 'weight')) infos <- c(infos, 'weight')
diff --git a/R-package/man/xgb.DMatrix.Rd b/R-package/man/xgb.DMatrix.Rd
index eb667377f0b3..ceb60dc42906 100644
--- a/R-package/man/xgb.DMatrix.Rd
+++ b/R-package/man/xgb.DMatrix.Rd
@@ -2,6 +2,7 @@
% Please edit documentation in R/xgb.DMatrix.R
\name{xgb.DMatrix}
\alias{xgb.DMatrix}
+\alias{xgb.QuantileDMatrix}
\title{Construct xgb.DMatrix object}
\usage{
xgb.DMatrix(
@@ -12,6 +13,7 @@ xgb.DMatrix(
missing = NA,
silent = FALSE,
feature_names = colnames(data),
+ feature_types = NULL,
nthread = NULL,
group = NULL,
qid = NULL,
@@ -20,12 +22,55 @@ xgb.DMatrix(
feature_weights = NULL,
enable_categorical = FALSE
)
+
+xgb.QuantileDMatrix(
+ data,
+ label = NULL,
+ weight = NULL,
+ base_margin = NULL,
+ missing = NA,
+ feature_names = colnames(data),
+ feature_types = NULL,
+ nthread = NULL,
+ group = NULL,
+ qid = NULL,
+ label_lower_bound = NULL,
+ label_upper_bound = NULL,
+ feature_weights = NULL,
+ enable_categorical = FALSE,
+ ref = NULL,
+ max_bin = NULL
+)
}
\arguments{
-\item{data}{a \code{matrix} object (either numeric or integer), a \code{dgCMatrix} object,
-a \code{dgRMatrix} object,
-a \code{dsparseVector} object (only when making predictions from a fitted model, will be
-interpreted as a row vector), or a character string representing a filename.}
+\item{data}{Data from which to create a DMatrix, which can then be used for fitting models or
+for getting predictions out of a fitted model.
+
+Supported input types are as follows:\itemize{
+\item \code{matrix} objects, with types \code{numeric}, \code{integer}, or \code{logical}.
+\item \code{data.frame} objects, with columns of types \code{numeric}, \code{integer}, \code{logical}, or \code{factor}.
+
+If passing \code{enable_categorical=TRUE}, columns with \code{factor} type will be treated as categorical.
+Otherwise, if passing \code{enable_categorical=FALSE} and the data contains \code{factor} columns, an error
+will be thrown.
+
+Note that xgboost uses base-0 encoding for categorical types, hence \code{factor} types (which use base-1
+encoding) will be converted inside the function call. Be aware that the encoding used for \code{factor}
+types is not kept as part of the model, so in subsequent calls to \code{predict}, it is the user's
+responsibility to ensure that factor columns have the same levels as the ones from which the DMatrix
+was constructed.
+
+Other column types are not supported.
+\item CSR matrices, as class \code{dgRMatrix} from package \code{Matrix}.
+\item CSC matrices, as class \code{dgCMatrix} from package \code{Matrix}. These are \bold{not} supported for
+'xgb.QuantileDMatrix'.
+\item Single-row CSR matrices, as class \code{dsparseVector} from package \code{Matrix}, which is interpreted
+as a single row (only when making predictions from a fitted model).
+\item Text files in SVMLight / LibSVM formats, passed as a path to the file. These are \bold{not}
+supported for 'xgb.QuantileDMatrix'.
+\item Binary files generated by \link{xgb.DMatrix.save}, passed as a path to the file. These are
+\bold{not} supported for 'xgb.QuantileDMatrix'.
+}}
\item{label}{Label of the training data.}
@@ -41,13 +86,36 @@ so it doesn't make sense to assign weights to individual data points.}
\if{html}{\out{<div class="sourceCode">}}\preformatted{ In the case of multi-output models, one can also pass multi-dimensional base_margin.
}\if{html}{\out{</div>}}}
-\item{missing}{a float value to represents missing values in data (used only when input is a dense matrix).
-It is useful when a 0 or some other extreme value represents missing values in data.}
+\item{missing}{A float value to represent missing values in data (not used when creating DMatrix
+from text files).
+It is useful to change when a zero, infinite, or some other extreme value represents missing
+values in data.}
\item{silent}{whether to suppress printing an informational message after loading from a file.}
\item{feature_names}{Set names for features. Overrides column names in data
-frame and matrix.}
+frame and matrix.
+
+\if{html}{\out{<div class="sourceCode">}}\preformatted{ Note: columns are not referenced by name when calling `predict`, so the column order there
+ must be the same as in the DMatrix construction, regardless of the column names.
+}\if{html}{\out{</div>}}}
+
+\item{feature_types}{Set types for features.
+
+If \code{data} is a \code{data.frame} and passing \code{enable_categorical=TRUE}, the types will be deduced
+automatically from the column types.
+
+Otherwise, one can pass a character vector with the same length as number of columns in \code{data},
+with the following possible values:\itemize{
+\item "c", which represents categorical columns.
+\item "q", which represents numeric columns.
+\item "int", which represents integer columns.
+\item "i", which represents logical (boolean) columns.
+}
+
+Note that, while categorical types are treated differently from the rest for model fitting
+purposes, the other types do not influence the generated model, but have effects in other
+functionalities such as feature importances.}
\item{nthread}{Number of threads used for creating DMatrix.}
@@ -74,13 +142,33 @@ frame and matrix.}
JSON/UBJSON serialization format is required for this.
}\if{html}{\out{</div>}}}
+
+\item{ref}{The training dataset that provides quantile information, needed when creating
+validation/test dataset with \code{xgb.QuantileDMatrix}. Supplying the training DMatrix
+as a reference means that the same quantisation applied to the training data is
+applied to the validation/test data.}
+
+\item{max_bin}{The number of histogram bins, should be consistent with the training parameter
+\code{max_bin}.
+
+This is only supported when constructing a QuantileDMatrix.}
+}
+\value{
+An 'xgb.DMatrix' object. If calling 'xgb.QuantileDMatrix', it will have additional
+subclass 'xgb.QuantileDMatrix'.
}
\description{
-Construct xgb.DMatrix object from either a dense matrix, a sparse matrix, or a local file.
-Supported input file formats are either a LIBSVM text file or a binary file that was created previously by
-\code{\link{xgb.DMatrix.save}}).
+Construct an 'xgb.DMatrix' object from a given data source, which can then be passed to functions
+such as \link{xgb.train} or \link{predict.xgb.Booster}.
}
\details{
+Function 'xgb.QuantileDMatrix' will construct a DMatrix with quantization for the histogram
+method already applied to it, which can be used to reduce memory usage (compared to using a
+regular DMatrix first and then creating a quantization out of it) when using the histogram
+method (\code{tree_method = "hist"}, which is the default algorithm), but is not usable for the
+sorted-indices method (\code{tree_method = "exact"}), nor for the approximate method
+(\code{tree_method = "approx"}).
+
Note that DMatrix objects are not serializable through R functions such as \code{saveRDS} or \code{save}.
If a DMatrix gets serialized and then de-serialized (for example, when saving data in an R session or caching
chunks in an Rmd file), the resulting object will not be usable anymore and will need to be reconstructed
diff --git a/R-package/man/xgb.DataIter.Rd b/R-package/man/xgb.DataIter.Rd
new file mode 100644
index 000000000000..29cf5acc9cf4
--- /dev/null
+++ b/R-package/man/xgb.DataIter.Rd
@@ -0,0 +1,51 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/xgb.DMatrix.R
+\name{xgb.DataIter}
+\alias{xgb.DataIter}
+\title{XGBoost Data Iterator}
+\usage{
+xgb.DataIter(env = new.env(), f_next, f_reset)
+}
+\arguments{
+\item{env}{An R environment to pass to the callback functions supplied here, which can be
+used to keep track of variables to determine how to handle the batches.
+
+For example, one might want to keep track of an iteration number in this environment in order
+to know which part of the data to pass next.}
+
+\item{f_next}{\verb{function(env)} which is responsible for:\itemize{
+\item Accessing or retrieving the next batch of data in the iterator.
+\item Supplying this data by calling function \link{xgb.ProxyDMatrix} on it and returning the result.
+\item Keeping track of where in the iterator batch it is or will go next, which can for example
+be done by modifying variables in the \code{env} variable that is passed here.
+\item Signaling whether there are more batches to be consumed or not, by returning \code{NULL}
+when the stream of data ends (all batches in the iterator have been consumed), or the result from
+calling \link{xgb.ProxyDMatrix} when there are more batches in the line to be consumed.
+}}
+
+\item{f_reset}{\verb{function(env)} which is responsible for resetting the data iterator
+(i.e. taking it back to the first batch, called before and after the sequence of batches
+has been consumed).
+
+Note that, after resetting the iterator, the batches will be accessed again, so the same data
+(and in the same order) must be passed in subsequent iterations.}
+}
+\value{
+An \code{xgb.DataIter} object, containing the same inputs supplied here, which can then
+be passed to \link{xgb.ExternalDMatrix}.
+}
+\description{
+Interface to create a custom data iterator in order to construct a DMatrix
+from external memory.
+
+This function is responsible for generating an R object structure containing callback
+functions and an environment shared with them.
+
+The output structure from this function is then meant to be passed to \link{xgb.ExternalDMatrix},
+which will consume the data and create a DMatrix from it by executing the callback functions.
+
+For more information, and for a usage example, see the documentation for \link{xgb.ExternalDMatrix}.
+}
+\seealso{
+\link{xgb.ExternalDMatrix}, \link{xgb.ProxyDMatrix}.
+}
diff --git a/R-package/man/xgb.ExternalDMatrix.Rd b/R-package/man/xgb.ExternalDMatrix.Rd
new file mode 100644
index 000000000000..3e7844990b50
--- /dev/null
+++ b/R-package/man/xgb.ExternalDMatrix.Rd
@@ -0,0 +1,122 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/xgb.DMatrix.R
+\name{xgb.ExternalDMatrix}
+\alias{xgb.ExternalDMatrix}
+\title{DMatrix from External Data}
+\usage{
+xgb.ExternalDMatrix(
+ data_iterator,
+ cache_prefix = tempdir(),
+ missing = NA,
+ nthread = NULL
+)
+}
+\arguments{
+\item{data_iterator}{A data iterator structure as returned by \link{xgb.DataIter},
+which includes an environment shared between function calls, and functions to access
+the data in batches on-demand.}
+
+\item{cache_prefix}{The path of cache file, caller must initialize all the directories in this path.}
+
+\item{missing}{A float value to represent missing values in data.
+
+Note that, while functions like \link{xgb.DMatrix} can take a generic \code{NA} and interpret it
+correctly for different types like \code{numeric} and \code{integer}, if an \code{NA} value is passed here,
+it will not be adapted for different input types.
+
+For example, in R \code{integer} types, missing values are represented by integer number \code{-2147483648}
+(since machine 'integer' types do not have an inherent 'NA' value) - hence, if one passes \code{NA},
+which is interpreted as a floating-point NaN by 'xgb.ExternalDMatrix' and by
+'xgb.QuantileDMatrix.from_iterator', these integer missing values will not be treated as missing.
+This should not pose any problem for \code{numeric} types, since they do have an inherent NaN value.}
+
+\item{nthread}{Number of threads used for creating DMatrix.}
+}
+\value{
+An 'xgb.DMatrix' object, with subclass 'xgb.ExternalDMatrix', in which the data is not
+held internally but accessed through the iterator when needed.
+}
+\description{
+Create a special type of xgboost 'DMatrix' object from external data
+supplied by an \link{xgb.DataIter} object, potentially passed in batches from a
+bigger set that might not fit entirely in memory.
+
+The data supplied by the iterator is accessed on-demand as needed, multiple times,
+without being concatenated, but note that fields like 'label' \bold{will} be
+concatenated from multiple calls to the data iterator.
+
+For more information, see the guide 'Using XGBoost External Memory Version':
+\url{https://xgboost.readthedocs.io/en/stable/tutorials/external_memory.html}
+}
+\examples{
+library(xgboost)
+data(mtcars)
+
+# this custom environment will be passed to the iterator
+# functions at each call. It's up to the user to keep
+# track of the iteration number in this environment.
+iterator_env <- as.environment(
+ list(
+ iter = 0,
+ x = mtcars[, -1],
+ y = mtcars[, 1]
+ )
+)
+
+# Data is passed in two batches.
+# In this example, batches are obtained by subsetting the 'x' variable.
+# This is not advantageous to do, since the data is already loaded in memory
+# and can be passed in full in one go, but there can be situations in which
+# only a subset of the data will fit in the computer's memory, and it can
+# be loaded in batches that are accessed one-at-a-time only.
+iterator_next <- function(iterator_env) {
+ curr_iter <- iterator_env[["iter"]]
+ if (curr_iter >= 2) {
+ # there are only two batches, so this signals end of the stream
+ return(NULL)
+ }
+
+ if (curr_iter == 0) {
+ x_batch <- iterator_env[["x"]][1:16, ]
+ y_batch <- iterator_env[["y"]][1:16]
+ } else {
+ x_batch <- iterator_env[["x"]][17:32, ]
+ y_batch <- iterator_env[["y"]][17:32]
+ }
+ on.exit({
+ iterator_env[["iter"]] <- curr_iter + 1
+ })
+
+ # Function 'xgb.ProxyDMatrix' must be called manually
+ # at each batch with all the appropriate attributes,
+ # such as feature names and feature types.
+ return(xgb.ProxyDMatrix(data = x_batch, label = y_batch))
+}
+
+# This moves the iterator back to its beginning
+iterator_reset <- function(iterator_env) {
+ iterator_env[["iter"]] <- 0
+}
+
+data_iterator <- xgb.DataIter(
+ env = iterator_env,
+ f_next = iterator_next,
+ f_reset = iterator_reset
+)
+cache_prefix <- tempdir()
+
+# DMatrix will be constructed from the iterator's batches
+dm <- xgb.ExternalDMatrix(data_iterator, cache_prefix, nthread = 1)
+
+# After construction, can be used as a regular DMatrix
+params <- list(nthread = 1, objective = "reg:squarederror")
+model <- xgb.train(data = dm, nrounds = 2, params = params)
+
+# Predictions can also be called on it, and should be the same
+# as if the data were passed differently.
+pred_dm <- predict(model, dm)
+pred_mat <- predict(model, as.matrix(mtcars[, -1]))
+}
+\seealso{
+\link{xgb.DataIter}, \link{xgb.ProxyDMatrix}, \link{xgb.QuantileDMatrix.from_iterator}
+}
diff --git a/R-package/man/xgb.ProxyDMatrix.Rd b/R-package/man/xgb.ProxyDMatrix.Rd
new file mode 100644
index 000000000000..5a9b6251af40
--- /dev/null
+++ b/R-package/man/xgb.ProxyDMatrix.Rd
@@ -0,0 +1,121 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/xgb.DMatrix.R
+\name{xgb.ProxyDMatrix}
+\alias{xgb.ProxyDMatrix}
+\title{Proxy DMatrix Updater}
+\usage{
+xgb.ProxyDMatrix(
+ data,
+ label = NULL,
+ weight = NULL,
+ base_margin = NULL,
+ feature_names = colnames(data),
+ feature_types = NULL,
+ group = NULL,
+ qid = NULL,
+ label_lower_bound = NULL,
+ label_upper_bound = NULL,
+ feature_weights = NULL,
+ enable_categorical = FALSE
+)
+}
+\arguments{
+\item{data}{Data belonging to the current batch.
+
+Note that not all of the input types supported by \link{xgb.DMatrix} are possible
+to pass here. Supported types are:\itemize{
+\item \code{matrix}, with types \code{numeric}, \code{integer}, and \code{logical}. Note that for types
+\code{integer} and \code{logical}, missing values might not be automatically recognized as
+such - see the documentation for parameter \code{missing} in \link{xgb.ExternalDMatrix}
+for details on this.
+\item \code{data.frame}, with the same types as supported by 'xgb.DMatrix' and same
+conversions applied to it. See the documentation for parameter \code{data} in
+\link{xgb.DMatrix} for details on it.
+\item CSR matrices, as class \code{dgRMatrix} from package \code{Matrix}.
+}}
+
+\item{label}{Label of the training data.}
+
+\item{weight}{Weight for each instance.
+
+Note that, for ranking task, weights are per-group. In ranking task, one weight
+is assigned to each group (not each data point). This is because we
+only care about the relative ordering of data points within each group,
+so it doesn't make sense to assign weights to individual data points.}
+
+\item{base_margin}{Base margin used for boosting from existing model.
+
+\if{html}{\out{<div class="sourceCode">}}\preformatted{ In the case of multi-output models, one can also pass multi-dimensional base_margin.
+}\if{html}{\out{</div>}}}
+
+\item{feature_names}{Set names for features. Overrides column names in data
+frame and matrix.
+
+\if{html}{\out{<div class="sourceCode">}}\preformatted{ Note: columns are not referenced by name when calling `predict`, so the column order there
+ must be the same as in the DMatrix construction, regardless of the column names.
+}\if{html}{\out{</div>}}}
+
+\item{feature_types}{Set types for features.
+
+If \code{data} is a \code{data.frame} and passing \code{enable_categorical=TRUE}, the types will be deduced
+automatically from the column types.
+
+Otherwise, one can pass a character vector with the same length as number of columns in \code{data},
+with the following possible values:\itemize{
+\item "c", which represents categorical columns.
+\item "q", which represents numeric columns.
+\item "int", which represents integer columns.
+\item "i", which represents logical (boolean) columns.
+}
+
+Note that, while categorical types are treated differently from the rest for model fitting
+purposes, the other types do not influence the generated model, but have effects in other
+functionalities such as feature importances.}
+
+\item{group}{Group sizes for all ranking groups.}
+
+\item{qid}{Query ID for data samples, used for ranking.}
+
+\item{label_lower_bound}{Lower bound for survival training.}
+
+\item{label_upper_bound}{Upper bound for survival training.}
+
+\item{feature_weights}{Set feature weights for column sampling.}
+
+\item{enable_categorical}{Experimental support of specializing for categorical features.
+
+\if{html}{\out{<div class="sourceCode">}}\preformatted{ If passing 'TRUE' and 'data' is a data frame,
+ columns of categorical types will automatically
+ be set to be of categorical type (feature_type='c') in the resulting DMatrix.
+
+ If passing 'FALSE' and 'data' is a data frame with categorical columns,
+ it will result in an error being thrown.
+
+ If 'data' is not a data frame, this argument is ignored.
+
+ JSON/UBJSON serialization format is required for this.
+}\if{html}{\out{</div>}}}
+}
+\value{
+An object of class \code{xgb.ProxyDMatrix}, which is just a list containing the
+data and parameters passed here. It does \bold{not} inherit from \code{xgb.DMatrix}.
+}
+\description{
+Helper function to supply data in batches of a data iterator when
+constructing a DMatrix from external memory through \link{xgb.ExternalDMatrix}
+or through \link{xgb.QuantileDMatrix.from_iterator}.
+
+This function is \bold{only} meant to be called inside of a callback function (which
+is passed as argument to function \link{xgb.DataIter} to construct a data iterator)
+when constructing a DMatrix through external memory - otherwise, one should call
+\link{xgb.DMatrix} or \link{xgb.QuantileDMatrix}.
+
+The object that results from calling this function directly is \bold{not} like the other
+\code{xgb.DMatrix} variants - i.e. cannot be used to train a model, nor to get predictions - only
+possible usage is to supply data to an iterator, from which a DMatrix is then constructed.
+
+For more information and for example usage, see the documentation for \link{xgb.ExternalDMatrix}.
+}
+\seealso{
+\link{xgb.DataIter}, \link{xgb.ExternalDMatrix}.
+}
diff --git a/R-package/man/xgb.QuantileDMatrix.from_iterator.Rd b/R-package/man/xgb.QuantileDMatrix.from_iterator.Rd
new file mode 100644
index 000000000000..21f24576dcb1
--- /dev/null
+++ b/R-package/man/xgb.QuantileDMatrix.from_iterator.Rd
@@ -0,0 +1,65 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/xgb.DMatrix.R
+\name{xgb.QuantileDMatrix.from_iterator}
+\alias{xgb.QuantileDMatrix.from_iterator}
+\title{QuantileDMatrix from External Data}
+\usage{
+xgb.QuantileDMatrix.from_iterator(
+ data_iterator,
+ missing = NA,
+ nthread = NULL,
+ ref = NULL,
+ max_bin = NULL
+)
+}
+\arguments{
+\item{data_iterator}{A data iterator structure as returned by \link{xgb.DataIter},
+which includes an environment shared between function calls, and functions to access
+the data in batches on-demand.}
+
+\item{missing}{A float value to represent missing values in data.
+
+Note that, while functions like \link{xgb.DMatrix} can take a generic \code{NA} and interpret it
+correctly for different types like \code{numeric} and \code{integer}, if an \code{NA} value is passed here,
+it will not be adapted for different input types.
+
+For example, in R \code{integer} types, missing values are represented by integer number \code{-2147483648}
+(since machine 'integer' types do not have an inherent 'NA' value) - hence, if one passes \code{NA},
+which is interpreted as a floating-point NaN by 'xgb.ExternalDMatrix' and by
+'xgb.QuantileDMatrix.from_iterator', these integer missing values will not be treated as missing.
+This should not pose any problem for \code{numeric} types, since they do have an inherent NaN value.}
+
+\item{nthread}{Number of threads used for creating DMatrix.}
+
+\item{ref}{The training dataset that provides quantile information, needed when creating
+validation/test dataset with \code{xgb.QuantileDMatrix}. Supplying the training DMatrix
+as a reference means that the same quantisation applied to the training data is
+applied to the validation/test data.}
+
+\item{max_bin}{The number of histogram bins, should be consistent with the training parameter
+\code{max_bin}.
+
+This is only supported when constructing a QuantileDMatrix.}
+}
+\value{
+An 'xgb.DMatrix' object, with subclass 'xgb.QuantileDMatrix'.
+}
+\description{
+Create an \code{xgb.QuantileDMatrix} object (exact same class as would be returned by
+calling function \link{xgb.QuantileDMatrix}, with the same advantages and limitations) from
+external data supplied by an \link{xgb.DataIter} object, potentially passed in batches from
+a bigger set that might not fit entirely in memory, same way as \link{xgb.ExternalDMatrix}.
+
+Note that, while external data will only be loaded through the iterator (thus the full data
+might not be held entirely in-memory), the quantized representation of the data will get
+created in-memory, being concatenated from multiple calls to the data iterator. The quantized
+version is typically lighter than the original data, so there might be cases in which this
+representation could potentially fit in memory even if the full data doesn't.
+
+For more information, see the guide 'Using XGBoost External Memory Version':
+\url{https://xgboost.readthedocs.io/en/stable/tutorials/external_memory.html}
+}
+\seealso{
+\link{xgb.DataIter}, \link{xgb.ProxyDMatrix}, \link{xgb.ExternalDMatrix},
+\link{xgb.QuantileDMatrix}
+}
diff --git a/R-package/src/init.c b/R-package/src/init.c
index fff5d9f901d2..a9f3f3e380c2 100644
--- a/R-package/src/init.c
+++ b/R-package/src/init.c
@@ -54,6 +54,14 @@ extern SEXP XGDMatrixCreateFromDF_R(SEXP, SEXP, SEXP);
extern SEXP XGDMatrixGetStrFeatureInfo_R(SEXP, SEXP);
extern SEXP XGDMatrixNumCol_R(SEXP);
extern SEXP XGDMatrixNumRow_R(SEXP);
+extern SEXP XGProxyDMatrixCreate_R(void);  /* (void): a real prototype; an empty () leaves the parameters unspecified in C */
+extern SEXP XGProxyDMatrixSetDataDense_R(SEXP, SEXP);
+extern SEXP XGProxyDMatrixSetDataCSR_R(SEXP, SEXP);
+extern SEXP XGProxyDMatrixSetDataColumnar_R(SEXP, SEXP);
+extern SEXP XGDMatrixCreateFromCallback_R(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
+extern SEXP XGQuantileDMatrixCreateFromCallback_R(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
+extern SEXP XGDMatrixFree_R(SEXP);
+extern SEXP XGGetRNAIntAsDouble(void);  /* (void): takes no arguments, registered with arity 0 */
extern SEXP XGDMatrixGetQuantileCut_R(SEXP);
extern SEXP XGDMatrixNumNonMissing_R(SEXP);
extern SEXP XGDMatrixGetDataAsCSR_R(SEXP);
@@ -105,6 +113,14 @@ static const R_CallMethodDef CallEntries[] = {
{"XGDMatrixGetStrFeatureInfo_R", (DL_FUNC) &XGDMatrixGetStrFeatureInfo_R, 2},
{"XGDMatrixNumCol_R", (DL_FUNC) &XGDMatrixNumCol_R, 1},
{"XGDMatrixNumRow_R", (DL_FUNC) &XGDMatrixNumRow_R, 1},
+ {"XGProxyDMatrixCreate_R", (DL_FUNC) &XGProxyDMatrixCreate_R, 0},
+ {"XGProxyDMatrixSetDataDense_R", (DL_FUNC) &XGProxyDMatrixSetDataDense_R, 2},
+ {"XGProxyDMatrixSetDataCSR_R", (DL_FUNC) &XGProxyDMatrixSetDataCSR_R, 2},
+ {"XGProxyDMatrixSetDataColumnar_R", (DL_FUNC) &XGProxyDMatrixSetDataColumnar_R, 2},
+ {"XGDMatrixCreateFromCallback_R", (DL_FUNC) &XGDMatrixCreateFromCallback_R, 7},
+ {"XGQuantileDMatrixCreateFromCallback_R", (DL_FUNC) &XGQuantileDMatrixCreateFromCallback_R, 8},
+ {"XGDMatrixFree_R", (DL_FUNC) &XGDMatrixFree_R, 1},
+ {"XGGetRNAIntAsDouble", (DL_FUNC) &XGGetRNAIntAsDouble, 0},
{"XGDMatrixGetQuantileCut_R", (DL_FUNC) &XGDMatrixGetQuantileCut_R, 1},
{"XGDMatrixNumNonMissing_R", (DL_FUNC) &XGDMatrixNumNonMissing_R, 1},
{"XGDMatrixGetDataAsCSR_R", (DL_FUNC) &XGDMatrixGetDataAsCSR_R, 1},
diff --git a/R-package/src/xgboost_R.cc b/R-package/src/xgboost_R.cc
index 1d01b9aae967..c91fb94c447c 100644
--- a/R-package/src/xgboost_R.cc
+++ b/R-package/src/xgboost_R.cc
@@ -27,7 +27,12 @@
#include "./xgboost_R.h" // Must follow other includes.
namespace {
-struct ErrorWithUnwind : public std::exception {};
+
+/* Note: this class is used as a throwable exception.
+Some xgboost C functions that use callbacks will catch exceptions
+that happen inside of the callback execution, hence it purposefully
+doesn't inherit from 'std::exception' even if used as such. */
+struct ErrorWithUnwind {};  // caught by type in XGDMatrixCreateFromCallbackGeneric_R, which then calls R_ContinueUnwind
void ThrowExceptionFromRError(void *, Rboolean jump) {
if (jump) {
@@ -51,6 +56,27 @@ SEXP SafeMkChar(const char *c_str, SEXP continuation_token) {
continuation_token);
}
+struct RFunAndEnv {  // bundles an R call target with its evaluation environment so both fit through one void*
+ SEXP R_fun;  // wrapped with Rf_lang1() in WrappedExecFun to form a zero-argument call
+ SEXP R_calling_env;  // environment handed to Rf_eval() in WrappedExecFun
+};
+
+/* Callback with the signature expected by R_UnwindProtect: builds the
+zero-argument call `R_fun()` and evaluates it in `R_calling_env`.
+`void_ptr` must point to a live RFunAndEnv. Returns the evaluation result. */
+SEXP WrappedExecFun(void *void_ptr) {
+  // The target type must be spelled out: static_cast needs its template argument.
+  RFunAndEnv *r_fun_and_env = static_cast<RFunAndEnv *>(void_ptr);
+  SEXP f_expr = Rf_protect(Rf_lang1(r_fun_and_env->R_fun));
+  SEXP out = Rf_protect(Rf_eval(f_expr, r_fun_and_env->R_calling_env));
+  Rf_unprotect(2);
+  return out;
+}
+
+/* Evaluates `R_fun()` in `R_calling_env` through R_UnwindProtect, registering
+ThrowExceptionFromRError as the cleanup handler and `continuation_token` as the
+unwind continuation. Returns whatever WrappedExecFun returns. */
+SEXP SafeExecFun(SEXP R_fun, SEXP R_calling_env, SEXP continuation_token) {
+  RFunAndEnv r_fun_and_env{R_fun, R_calling_env};
+  return R_UnwindProtect(
+    WrappedExecFun, static_cast<void *>(&r_fun_and_env),
+    ThrowExceptionFromRError, nullptr,
+    continuation_token);
+}
+
SEXP WrappedAllocReal(void *void_ptr) {
size_t *size = static_cast(void_ptr);
return Rf_allocVector(REALSXP, *size);
@@ -140,6 +166,47 @@ SEXP SafeAllocInteger(size_t size, SEXP continuation_token) {
return "";
}
+/* Serializes the columns of an R data.frame (or list of equal-length vectors)
+into a JSON list of array-interface objects, one entry per column.
+Integer, double and logical columns are supported; any other column type is
+reported through LOG(FATAL). The row count is taken from the first column. */
+[[nodiscard]] std::string MakeArrayInterfaceFromRDataFrame(SEXP R_df) {
+  auto make_vec = [&](auto const *ptr, std::size_t len) {
+    auto v = xgboost::linalg::MakeVec(ptr, len);
+    return xgboost::linalg::ArrayInterface(v);
+  };
+
+  R_xlen_t n_features = Rf_xlength(R_df);
+  // Validate before touching column 0 or sizing the output.
+  CHECK_GT(n_features, 0);
+  std::vector<xgboost::Json> array(n_features);
+  std::size_t len = Rf_xlength(VECTOR_ELT(R_df, 0));
+
+  // The `data.frame` in R actually converts all data into numeric. The other type
+  // handlers here are not used. At the moment they are kept as a reference for when we
+  // can avoid making data copies during transformation.
+  for (R_xlen_t i = 0; i < n_features; ++i) {
+    SEXP column = VECTOR_ELT(R_df, i);
+    switch (TYPEOF(column)) {
+      case INTSXP: {
+        auto const *ptr = INTEGER(column);
+        array[i] = make_vec(ptr, len);
+        break;
+      }
+      case REALSXP: {
+        auto const *ptr = REAL(column);
+        array[i] = make_vec(ptr, len);
+        break;
+      }
+      case LGLSXP: {
+        auto const *ptr = LOGICAL(column);
+        array[i] = make_vec(ptr, len);
+        break;
+      }
+      default: {
+        LOG(FATAL) << "data.frame has unsupported type.";
+      }
+    }
+  }
+
+  xgboost::Json jinterface{std::move(array)};
+  return xgboost::Json::Dump(jinterface);
+}
+
[[nodiscard]] std::string MakeJsonConfigForArray(SEXP missing, SEXP n_threads, SEXPTYPE arr_type) {
using namespace ::xgboost; // NOLINT
Json jconfig{Object{}};
@@ -335,51 +402,13 @@ XGB_DLL SEXP XGDMatrixCreateFromDF_R(SEXP df, SEXP missing, SEXP n_threads) {
R_API_BEGIN();
DMatrixHandle handle;
-
- auto make_vec = [&](auto const *ptr, std::int32_t len) {
- auto v = xgboost::linalg::MakeVec(ptr, len);
- return xgboost::linalg::ArrayInterface(v);
- };
-
std::int32_t rc{0};
{
- using xgboost::Json;
- auto n_features = Rf_xlength(df);
- std::vector array(n_features);
- CHECK_GT(n_features, 0);
- auto len = Rf_xlength(VECTOR_ELT(df, 0));
- // The `data.frame` in R actually converts all data into numeric. The other type
- // handlers here are not used. At the moment they are kept as a reference for when we
- // can avoid making data copies during transformation.
- for (decltype(n_features) i = 0; i < n_features; ++i) {
- switch (TYPEOF(VECTOR_ELT(df, i))) {
- case INTSXP: {
- auto const *ptr = INTEGER(VECTOR_ELT(df, i));
- array[i] = make_vec(ptr, len);
- break;
- }
- case REALSXP: {
- auto const *ptr = REAL(VECTOR_ELT(df, i));
- array[i] = make_vec(ptr, len);
- break;
- }
- case LGLSXP: {
- auto const *ptr = LOGICAL(VECTOR_ELT(df, i));
- array[i] = make_vec(ptr, len);
- break;
- }
- default: {
- LOG(FATAL) << "data.frame has unsupported type.";
- }
- }
- }
-
- Json jinterface{std::move(array)};
- auto sinterface = Json::Dump(jinterface);
- Json jconfig{xgboost::Object{}};
+ std::string sinterface = MakeArrayInterfaceFromRDataFrame(df);
+ xgboost::Json jconfig{xgboost::Object{}};
jconfig["missing"] = asReal(missing);
jconfig["nthread"] = asInteger(n_threads);
- auto sconfig = Json::Dump(jconfig);
+ std::string sconfig = xgboost::Json::Dump(jconfig);
rc = XGDMatrixCreateFromColumnar(sinterface.c_str(), sconfig.c_str(), &handle);
}
@@ -632,6 +661,192 @@ XGB_DLL SEXP XGDMatrixNumCol_R(SEXP handle) {
return ScalarInteger(static_cast(ncol));
}
+XGB_DLL SEXP XGProxyDMatrixCreate_R() {  // creates a proxy DMatrix and returns it wrapped in an R externalptr
+ SEXP out = Rf_protect(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue));  // empty externalptr, filled in once the handle exists
+ R_API_BEGIN();
+ DMatrixHandle proxy_dmat_handle;
+ CHECK_CALL(XGProxyDMatrixCreate(&proxy_dmat_handle));
+ R_SetExternalPtrAddr(out, proxy_dmat_handle);
+ R_RegisterCFinalizerEx(out, _DMatrixFinalizer, TRUE);  // TRUE: also run the finalizer when R exits
+ Rf_unprotect(1);  // balances the single Rf_protect above
+ R_API_END();
+ return out;
+}
+
+XGB_DLL SEXP XGProxyDMatrixSetDataDense_R(SEXP handle, SEXP R_mat) {  // sets a dense R matrix as the proxy's current data; returns NULL
+ R_API_BEGIN();
+ DMatrixHandle proxy_dmat = R_ExternalPtrAddr(handle);
+ int res_code;
+ {  // inner scope: array_str is destroyed before CHECK_CALL examines res_code
+ std::string array_str = MakeArrayInterfaceFromRMat(R_mat);
+ res_code = XGProxyDMatrixSetDataDense(proxy_dmat, array_str.c_str());
+ }
+ CHECK_CALL(res_code);
+ R_API_END();
+ return R_NilValue;
+}
+
+XGB_DLL SEXP XGProxyDMatrixSetDataCSR_R(SEXP handle, SEXP lst) {  // sets CSR data on the proxy from a 4-element R list; returns NULL
+ R_API_BEGIN();
+ DMatrixHandle proxy_dmat = R_ExternalPtrAddr(handle);
+ int res_code;
+ {  // inner scope: the interface strings are destroyed before CHECK_CALL examines res_code
+ std::string array_str_indptr = MakeArrayInterfaceFromRVector(VECTOR_ELT(lst, 0));  // element 0: indptr
+ std::string array_str_indices = MakeArrayInterfaceFromRVector(VECTOR_ELT(lst, 1));  // element 1: indices
+ std::string array_str_data = MakeArrayInterfaceFromRVector(VECTOR_ELT(lst, 2));  // element 2: data
+ const int ncol = Rf_asInteger(VECTOR_ELT(lst, 3));  // element 3: number of columns
+ res_code = XGProxyDMatrixSetDataCSR(proxy_dmat,
+ array_str_indptr.c_str(),
+ array_str_indices.c_str(),
+ array_str_data.c_str(),
+ ncol);
+ }
+ CHECK_CALL(res_code);
+ R_API_END();
+ return R_NilValue;
+}
+
+XGB_DLL SEXP XGProxyDMatrixSetDataColumnar_R(SEXP handle, SEXP lst) {  // sets columnar (data.frame / list of columns) data on the proxy; returns NULL
+ R_API_BEGIN();
+ DMatrixHandle proxy_dmat = R_ExternalPtrAddr(handle);
+ int res_code;
+ {  // inner scope: sinterface is destroyed before CHECK_CALL examines res_code
+ std::string sinterface = MakeArrayInterfaceFromRDataFrame(lst);
+ res_code = XGProxyDMatrixSetDataColumnar(proxy_dmat, sinterface.c_str());
+ }
+ CHECK_CALL(res_code);
+ R_API_END();
+ return R_NilValue;
+}
+
+namespace {
+
+struct _RDataIterator {  // holds the R callbacks and context used to drive a data iterator from C++
+ SEXP f_next;  // evaluated by next()
+ SEXP f_reset;  // evaluated by reset()
+ SEXP calling_env;  // environment in which both callbacks are evaluated
+ SEXP continuation_token;  // forwarded to SafeExecFun for R_UnwindProtect
+
+ _RDataIterator(
+ SEXP f_next, SEXP f_reset, SEXP calling_env, SEXP continuation_token) :
+ f_next(f_next), f_reset(f_reset), calling_env(calling_env),
+ continuation_token(continuation_token) {}
+
+ void reset() {  // evaluates f_reset; its result is discarded
+ SafeExecFun(this->f_reset, this->calling_env, this->continuation_token);
+ }
+
+ int next() {  // evaluates f_next and returns its result coerced via Rf_asInteger
+ SEXP R_res = Rf_protect(
+ SafeExecFun(this->f_next, this->calling_env, this->continuation_token));
+ int res = Rf_asInteger(R_res);
+ Rf_unprotect(1);
+ return res;
+ }
+};
+
+// Reset trampoline: recovers the _RDataIterator behind the opaque handle and rewinds it.
+void _reset_RDataIterator(DataIterHandle iter) {
+  auto *r_iterator = static_cast<_RDataIterator*>(iter);
+  r_iterator->reset();
+}
+
+// Next trampoline: recovers the _RDataIterator behind the opaque handle and
+// returns the integer produced by its next() call.
+int _next_RDataIterator(DataIterHandle iter) {
+  auto *r_iterator = static_cast<_RDataIterator*>(iter);
+  int const status = r_iterator->next();
+  return status;
+}
+
+SEXP XGDMatrixCreateFromCallbackGeneric_R(  // shared builder behind the external-memory and quantile callback constructors
+ SEXP f_next, SEXP f_reset, SEXP calling_env, SEXP proxy_dmat,
+ SEXP n_threads, SEXP missing, SEXP max_bin, SEXP ref_dmat,
+ SEXP cache_prefix, bool as_quantile_dmatrix) {  // max_bin/ref_dmat are read only when as_quantile_dmatrix is true, cache_prefix only when false
+ SEXP continuation_token = Rf_protect(R_MakeUnwindCont());
+ SEXP out = Rf_protect(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue));
+ R_API_BEGIN();
+ DMatrixHandle out_dmat;
+
+ int res_code;  // assigned in both branches below before CHECK_CALL reads it
+ try {
+ _RDataIterator data_iterator(f_next, f_reset, calling_env, continuation_token);  // NOTE(review): stack object; confirm the core does not keep this DataIterHandle after the create call returns
+
+ std::string str_cache_prefix;
+ xgboost::Json jconfig{xgboost::Object{}};
+ jconfig["missing"] = Rf_asReal(missing);
+ if (!Rf_isNull(n_threads)) {  // "nthread" is left out of the config when n_threads is NULL
+ jconfig["nthread"] = Rf_asInteger(n_threads);
+ }
+ if (as_quantile_dmatrix) {
+ if (!Rf_isNull(max_bin)) {  // "max_bin" is left out of the config when max_bin is NULL
+ jconfig["max_bin"] = Rf_asInteger(max_bin);
+ }
+ } else {
+ str_cache_prefix = std::string(CHAR(Rf_asChar(cache_prefix)));
+ jconfig["cache_prefix"] = str_cache_prefix;
+ }
+ std::string json_str = xgboost::Json::Dump(jconfig);
+
+ DMatrixHandle ref_dmat_handle = nullptr;  // stays null unless a reference DMatrix was supplied
+ if (as_quantile_dmatrix && !Rf_isNull(ref_dmat)) {
+ ref_dmat_handle = R_ExternalPtrAddr(ref_dmat);
+ }
+
+ if (as_quantile_dmatrix) {
+ res_code = XGQuantileDMatrixCreateFromCallback(
+ &data_iterator,
+ R_ExternalPtrAddr(proxy_dmat),
+ ref_dmat_handle,
+ _reset_RDataIterator,
+ _next_RDataIterator,
+ json_str.c_str(),
+ &out_dmat);
+ } else {
+ res_code = XGDMatrixCreateFromCallback(
+ &data_iterator,
+ R_ExternalPtrAddr(proxy_dmat),
+ _reset_RDataIterator,
+ _next_RDataIterator,
+ json_str.c_str(),
+ &out_dmat);
+ }
+ } catch (ErrorWithUnwind &e) {  // an R-level jump surfaced from a callback; hand control back to R
+ R_ContinueUnwind(continuation_token);
+ }
+ CHECK_CALL(res_code);
+
+ R_SetExternalPtrAddr(out, out_dmat);
+ R_RegisterCFinalizerEx(out, _DMatrixFinalizer, TRUE);  // TRUE: also run the finalizer when R exits
+ Rf_unprotect(2);  // balances continuation_token and out
+ R_API_END();
+ return out;
+}
+
+} /* namespace */
+
+// QuantileDMatrix entry point: forwards to the generic builder with no cache
+// prefix and with the quantile flag set.
+XGB_DLL SEXP XGQuantileDMatrixCreateFromCallback_R(
+    SEXP f_next, SEXP f_reset, SEXP calling_env, SEXP proxy_dmat,
+    SEXP n_threads, SEXP missing, SEXP max_bin, SEXP ref_dmat) {
+  SEXP const no_cache_prefix = R_NilValue;
+  bool const build_quantile = true;
+  return XGDMatrixCreateFromCallbackGeneric_R(
+      f_next, f_reset, calling_env, proxy_dmat,
+      n_threads, missing, max_bin, ref_dmat,
+      no_cache_prefix, build_quantile);
+}
+
+// External-memory entry point: forwards to the generic builder with the cache
+// prefix, passing R NULL for max_bin and ref_dmat and the quantile flag unset.
+XGB_DLL SEXP XGDMatrixCreateFromCallback_R(
+    SEXP f_next, SEXP f_reset, SEXP calling_env, SEXP proxy_dmat,
+    SEXP n_threads, SEXP missing, SEXP cache_prefix) {
+  SEXP const no_max_bin = R_NilValue;
+  SEXP const no_ref_dmat = R_NilValue;
+  bool const build_quantile = false;
+  return XGDMatrixCreateFromCallbackGeneric_R(
+      f_next, f_reset, calling_env, proxy_dmat,
+      n_threads, missing, no_max_bin, no_ref_dmat,
+      cache_prefix, build_quantile);
+}
+
+XGB_DLL SEXP XGDMatrixFree_R(SEXP proxy_dmat) {  // invokes _DMatrixFinalizer directly on the given externalptr; returns NULL
+ _DMatrixFinalizer(proxy_dmat);
+ return R_NilValue;
+}
+
+/* Returns R's integer NA sentinel (R_NaInt) converted to double,
+wrapped as a length-one numeric vector. */
+XGB_DLL SEXP XGGetRNAIntAsDouble() {
+  // The target type must be spelled out: static_cast needs its template argument.
+  double sentinel_as_double = static_cast<double>(R_NaInt);
+  return Rf_ScalarReal(sentinel_as_double);
+}
+
XGB_DLL SEXP XGDuplicate_R(SEXP obj) {
return Rf_duplicate(obj);
}
diff --git a/R-package/src/xgboost_R.h b/R-package/src/xgboost_R.h
index ec30dbada79f..d2e0ae82855d 100644
--- a/R-package/src/xgboost_R.h
+++ b/R-package/src/xgboost_R.h
@@ -161,6 +161,84 @@ XGB_DLL SEXP XGDMatrixNumRow_R(SEXP handle);
*/
XGB_DLL SEXP XGDMatrixNumCol_R(SEXP handle);
+/*!
+ * \brief create a ProxyDMatrix and get an R externalptr object for it
+ * \return R externalptr holding the newly created ProxyDMatrix
+ */
+XGB_DLL SEXP XGProxyDMatrixCreate_R();
+
+/*!
+ * \brief Set dense matrix data on a proxy dmatrix
+ * \param handle R externalptr pointing to a ProxyDMatrix
+ * \param R_mat R matrix whose data is set on the proxy dmatrix
+ */
+XGB_DLL SEXP XGProxyDMatrixSetDataDense_R(SEXP handle, SEXP R_mat);
+
+/*!
+ * \brief Set CSR (compressed sparse row) matrix data on a proxy dmatrix
+ * \param handle R externalptr pointing to a ProxyDMatrix
+ * \param lst R list containing, in this order:
+ * 1. 'p' or 'indptr' vector of the CSR matrix.
+ * 2. 'j' or 'indices' vector of the CSR matrix.
+ * 3. 'x' or 'data' vector of the CSR matrix.
+ * 4. Number of columns in the CSR matrix.
+ */
+XGB_DLL SEXP XGProxyDMatrixSetDataCSR_R(SEXP handle, SEXP lst);
+
+/*!
+ * \brief Set columnar data (an R list or data.frame of columns) on a proxy dmatrix
+ * \param handle R externalptr pointing to a ProxyDMatrix
+ * \param lst R list or data.frame object containing its columns as numeric vectors
+ */
+XGB_DLL SEXP XGProxyDMatrixSetDataColumnar_R(SEXP handle, SEXP lst);
+
+/*!
+ * \brief Create a DMatrix from a DataIter with callbacks
+ * \param expr_f_next expression for function(env, proxy_dmat) that sets the data on the proxy
+ * dmatrix and returns either zero (end of batch) or one (batch continues).
+ * \param expr_f_reset expression for function(env) that resets the data iterator to
+ * the beginning (first batch).
+ * \param calling_env R environment where to evaluate the expressions above
+ * \param proxy_dmat R externalptr holding a ProxyDMatrix.
+ * \param n_threads number of parallel threads to use for constructing the DMatrix.
+ * \param missing value that represents missing entries in the data.
+ * \param cache_prefix path of cache file
+ * \return R externalptr (handle) holding the resulting DMatrix.
+ */
+XGB_DLL SEXP XGDMatrixCreateFromCallback_R(
+ SEXP expr_f_next, SEXP expr_f_reset, SEXP calling_env, SEXP proxy_dmat,
+ SEXP n_threads, SEXP missing, SEXP cache_prefix);
+
+/*!
+ * \brief Create a QuantileDMatrix from a DataIter with callbacks
+ * \param expr_f_next expression for function(env, proxy_dmat) that sets the data on the proxy
+ * dmatrix and returns either zero (end of batch) or one (batch continues).
+ * \param expr_f_reset expression for function(env) that resets the data iterator to
+ * the beginning (first batch).
+ * \param calling_env R environment where to evaluate the expressions above
+ * \param proxy_dmat R externalptr holding a ProxyDMatrix.
+ * \param n_threads number of parallel threads to use for constructing the QuantileDMatrix.
+ * \param missing value that represents missing entries in the data.
+ * \param max_bin maximum number of bins to have in the resulting QuantileDMatrix.
+ * \param ref_dmat an optional reference DMatrix from which to get the bin boundaries.
+ * \return R externalptr (handle) holding the resulting QuantileDMatrix.
+ */
+XGB_DLL SEXP XGQuantileDMatrixCreateFromCallback_R(
+ SEXP expr_f_next, SEXP expr_f_reset, SEXP calling_env, SEXP proxy_dmat,
+ SEXP n_threads, SEXP missing, SEXP max_bin, SEXP ref_dmat);
+
+/*!
+ * \brief Frees a ProxyDMatrix and empties out the R externalptr object that holds it
+ * \param proxy_dmat R externalptr containing a ProxyDMatrix
+ * \return R NULL (R_NilValue)
+ */
+XGB_DLL SEXP XGDMatrixFree_R(SEXP proxy_dmat);
+
+/*!
+ * \brief Get the value that represents missingness in R integers as a numeric non-missing value.
+ * \return R numeric scalar holding R's integer NA sentinel (R_NaInt) converted to double
+ */
+XGB_DLL SEXP XGGetRNAIntAsDouble();
+
/*!
* \brief Call R C-level function 'duplicate'
* \param obj Object to duplicate
diff --git a/R-package/tests/testthat/test_dmatrix.R b/R-package/tests/testthat/test_dmatrix.R
index 568aaa3bd78d..65374240df00 100644
--- a/R-package/tests/testthat/test_dmatrix.R
+++ b/R-package/tests/testthat/test_dmatrix.R
@@ -343,7 +343,7 @@ test_that("xgb.DMatrix: data.frame", {
expect_equal(
getinfo(m, "feature_type"), c("float", "float", "int", "i", "c", "c")
)
- expect_error(xgb.DMatrix(df))
+ expect_error(xgb.DMatrix(df, enable_categorical = FALSE))
df <- data.frame(
missing = c("a", "b", "d", NA),
@@ -380,6 +380,261 @@ test_that("xgb.DMatrix: can take multi-dimensional 'base_margin'", {
expect_equal(pred_only_x, pred_w_base - b, tolerance = 1e-5)
})
+# Checks that a model trained on an xgb.QuantileDMatrix gives the same
+# predictions as one trained on a regular xgb.DMatrix, for each supported
+# input representation (dense matrix, data.frame, CSR sparse matrix).
+test_that("xgb.DMatrix: QuantileDMatrix produces same result as DMatrix", {
+  data(mtcars)
+  y <- mtcars[, 1]
+  x <- mtcars[, -1]
+
+  cast_matrix <- function(x) as.matrix(x)
+  cast_df <- function(x) as.data.frame(x)
+  cast_csr <- function(x) as(as.matrix(x), "RsparseMatrix")
+  casting_funs <- list(cast_matrix, cast_df, cast_csr)
+
+  params <- list(
+    tree_method = "hist",
+    objective = "reg:squarederror",
+    nthread = n_threads,
+    max_bin = 5
+  )
+
+  # The baseline model does not depend on the casting function, so it is
+  # fitted once here instead of being retrained on every loop iteration.
+  dm <- xgb.DMatrix(
+    data = x,
+    label = y,
+    nthread = n_threads
+  )
+  model_dm <- xgb.train(
+    params = params,
+    data = dm,
+    nrounds = 2
+  )
+  pred_dm <- predict(model_dm, x)
+
+  for (casting_fun in casting_funs) {
+    qdm <- xgb.QuantileDMatrix(
+      data = casting_fun(x),
+      label = y,
+      nthread = n_threads,
+      max_bin = 5
+    )
+    model_qdm <- xgb.train(
+      params = params,
+      data = qdm,
+      nrounds = 2
+    )
+    pred_qdm <- predict(model_qdm, x)
+
+    expect_equal(pred_qdm, pred_dm)
+  }
+})
+
+# Training with tree_method = "exact" on a QuantileDMatrix must raise an error.
+test_that("xgb.DMatrix: QuantileDMatrix is not accepted by exact method", {
+  data(mtcars)
+  # First column of mtcars is used as the label, the rest as features.
+  features <- as.matrix(mtcars[, -1])
+  target <- mtcars[, 1]
+  quantile_dm <- xgb.QuantileDMatrix(
+    data = features,
+    label = target,
+    nthread = n_threads
+  )
+  exact_params <- list(
+    tree_method = "exact",
+    objective = "reg:squarederror",
+    nthread = n_threads
+  )
+  expect_error(
+    xgb.train(params = exact_params, data = quantile_dm, nrounds = 2)
+  )
+})
+
+# Builds an xgb.ExternalDMatrix from a two-batch data iterator over mtcars and
+# checks that training and predicting with it match a regular xgb.DMatrix.
+test_that("xgb.DMatrix: ExternalDMatrix produces the same results as regular DMatrix", {
+  data(mtcars)
+  y <- mtcars[, 1]
+  x <- as.matrix(mtcars[, -1])
+  set.seed(123)
+  params <- list(objective = "reg:squarederror", nthread = n_threads)
+  # Reference model and predictions from the in-memory DMatrix.
+  model <- xgb.train(data = xgb.DMatrix(x, label = y), params = params, nrounds = 5)
+  pred <- predict(model, x)
+
+  # Iterator state: batch counter plus the full features and labels.
+  iterator_env <- as.environment(
+    list(iter = 0, x = mtcars[, -1], y = mtcars[, 1])
+  )
+  # Serves rows 1-16 as the first batch and rows 17-32 as the second;
+  # returns NULL once both batches have been delivered.
+  iterator_next <- function(iterator_env, proxy_handle) {
+    batch_idx <- iterator_env[["iter"]]
+    if (batch_idx >= 2) {
+      return(NULL)
+    }
+    rows <- if (batch_idx == 0) 1:16 else 17:32
+    # Advance the counter when the function exits.
+    on.exit({
+      iterator_env[["iter"]] <- batch_idx + 1
+    })
+    xgb.ProxyDMatrix(
+      data = iterator_env[["x"]][rows, ],
+      label = iterator_env[["y"]][rows]
+    )
+  }
+  iterator_reset <- function(iterator_env) {
+    iterator_env[["iter"]] <- 0
+  }
+  data_iterator <- xgb.DataIter(
+    env = iterator_env,
+    f_next = iterator_next,
+    f_reset = iterator_reset
+  )
+  cache_prefix <- tempdir()
+  edm <- xgb.ExternalDMatrix(data_iterator, cache_prefix, nthread = 1)
+  expect_true(inherits(edm, "xgb.ExternalDMatrix"))
+  expect_true(inherits(edm, "xgb.DMatrix"))
+
+  # Same seed as the reference fit before training on the external matrix.
+  set.seed(123)
+  model_ext <- xgb.train(data = edm, params = params, nrounds = 5)
+
+  pred_model1_edm <- predict(model, edm)
+  pred_model2_mat <- predict(model_ext, x)
+  pred_model2_edm <- predict(model_ext, edm)
+
+  expect_equal(pred_model1_edm, pred)
+  expect_equal(pred_model2_mat, pred)
+  expect_equal(pred_model2_edm, pred)
+})
+
+# Builds an xgb.QuantileDMatrix from a two-batch data iterator over mtcars and
+# checks that training and predicting with it match a QuantileDMatrix that was
+# constructed directly from the in-memory matrix.
+test_that("xgb.DMatrix: External QDM produces same results as regular QDM", {
+  data(mtcars)
+  y <- mtcars[, 1]
+  x <- as.matrix(mtcars[, -1])
+  set.seed(123)
+  params <- list(
+    objective = "reg:squarederror",
+    nthread = n_threads,
+    max_bin = 3
+  )
+  # Reference model trained on the in-memory QuantileDMatrix.
+  model <- xgb.train(
+    data = xgb.QuantileDMatrix(
+      x,
+      label = y,
+      nthread = 1,
+      max_bin = 3
+    ),
+    params = params,
+    nrounds = 5
+  )
+  pred <- predict(model, x)
+
+  # Iterator state: batch counter plus the full features and labels.
+  iterator_env <- as.environment(
+    list(
+      iter = 0,
+      x = mtcars[, -1],
+      y = mtcars[, 1]
+    )
+  )
+  # Serves rows 1-16 and then rows 17-32; returns NULL after the second batch.
+  iterator_next <- function(iterator_env, proxy_handle) {
+    curr_iter <- iterator_env[["iter"]]
+    if (curr_iter >= 2) {
+      return(NULL)
+    }
+    if (curr_iter == 0) {
+      x_batch <- iterator_env[["x"]][1:16, ]
+      y_batch <- iterator_env[["y"]][1:16]
+    } else {
+      x_batch <- iterator_env[["x"]][17:32, ]
+      y_batch <- iterator_env[["y"]][17:32]
+    }
+    # Advance the counter when the function exits.
+    on.exit({
+      iterator_env[["iter"]] <- curr_iter + 1
+    })
+    return(xgb.ProxyDMatrix(data = x_batch, label = y_batch))
+  }
+  iterator_reset <- function(iterator_env) {
+    iterator_env[["iter"]] <- 0
+  }
+  data_iterator <- xgb.DataIter(
+    env = iterator_env,
+    f_next = iterator_next,
+    f_reset = iterator_reset
+  )
+  # No cache prefix is passed to xgb.QuantileDMatrix.from_iterator here,
+  # so none is created in this test.
+  qdm <- xgb.QuantileDMatrix.from_iterator(
+    data_iterator,
+    max_bin = 3,
+    nthread = 1
+  )
+  expect_true(inherits(qdm, "xgb.QuantileDMatrix"))
+  expect_true(inherits(qdm, "xgb.DMatrix"))
+  # Same seed as the reference fit before training on the iterator-built matrix.
+  set.seed(123)
+  model_ext <- xgb.train(
+    data = qdm,
+    params = params,
+    nrounds = 5
+  )
+
+  pred_model1_qdm <- predict(model, qdm)
+  pred_model2_mat <- predict(model_ext, x)
+  pred_model2_qdm <- predict(model_ext, qdm)
+
+  expect_equal(pred_model1_qdm, pred)
+  expect_equal(pred_model2_mat, pred)
+  expect_equal(pred_model2_qdm, pred)
+})
+
+# An R error raised inside the iterator's f_next callback must surface to the
+# caller of xgb.ExternalDMatrix with its original message.
+test_that("xgb.DMatrix: R errors thrown on DataIterator are thrown back to the user", {
+  data(mtcars)
+  iterator_env <- as.environment(
+    list(
+      iter = 0,
+      x = mtcars[, -1],
+      y = mtcars[, 1]
+    )
+  )
+  # The first call serves rows 1-16; the second call deliberately fails.
+  iterator_next <- function(iterator_env, proxy_handle) {
+    curr_iter <- iterator_env[["iter"]]
+    if (curr_iter >= 2) {
+      # End-of-data signal, same as the other iterators in this file.
+      return(NULL)
+    }
+    if (curr_iter == 0) {
+      x_batch <- iterator_env[["x"]][1:16, ]
+      y_batch <- iterator_env[["y"]][1:16]
+    } else {
+      stop("custom error")
+    }
+    # Advance the counter when the function exits.
+    on.exit({
+      iterator_env[["iter"]] <- curr_iter + 1
+    })
+    return(xgb.ProxyDMatrix(data = x_batch, label = y_batch))
+  }
+  iterator_reset <- function(iterator_env) {
+    iterator_env[["iter"]] <- 0
+  }
+  data_iterator <- xgb.DataIter(
+    env = iterator_env,
+    f_next = iterator_next,
+    f_reset = iterator_reset
+  )
+  expect_error(
+    xgb.ExternalDMatrix(data_iterator, nthread = 1),
+    "custom error"
+  )
+})
+
test_that("xgb.DMatrix: number of non-missing matches data", {
x <- matrix(1:10, nrow = 5)
dm1 <- xgb.DMatrix(x)
diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py
index 5761b4b14db7..27331d3de5ca 100644
--- a/python-package/xgboost/core.py
+++ b/python-package/xgboost/core.py
@@ -798,9 +798,23 @@ def __init__(
Set names for features.
feature_types :
- Set types for features. When `enable_categorical` is set to `True`, string
- "c" represents categorical data type while "q" represents numerical feature
- type. For categorical features, the input is assumed to be preprocessed and
+ Set types for features. If `data` is a DataFrame and
+ `enable_categorical=True` is passed, the types will be deduced
+ automatically from the column types.
+
+ Otherwise, one can pass a list-like input with the same length as the
+ number of columns in `data`, with the following possible values:
+ - "c", which represents categorical columns.
+ - "q", which represents numeric columns.
+ - "int", which represents integer columns.
+ - "i", which represents boolean columns.
+
+ Note that, while categorical types are treated differently from
+ the rest for model fitting purposes, the other types do not influence
+ the generated model, but have effects in other functionalities such as
+ feature importances.
+
+ For categorical features, the input is assumed to be preprocessed and
encoded by the users. The encoding can be done via
:py:class:`sklearn.preprocessing.OrdinalEncoder` or pandas dataframe
`.cat.codes` method. This is useful when users want to specify categorical