From 3abbbe41accc2ad0df202d40e6b1f79cb1b98a0f Mon Sep 17 00:00:00 2001 From: david-cortes Date: Tue, 30 Jan 2024 12:26:44 +0100 Subject: [PATCH] [R] Add data iterator, quantile dmatrix, external memory, and missing `feature_types` (#9913) --- R-package/DESCRIPTION | 2 +- R-package/NAMESPACE | 5 + R-package/R/xgb.DMatrix.R | 712 ++++++++++++++++-- R-package/man/xgb.DMatrix.Rd | 108 ++- R-package/man/xgb.DataIter.Rd | 51 ++ R-package/man/xgb.ExternalDMatrix.Rd | 122 +++ R-package/man/xgb.ProxyDMatrix.Rd | 121 +++ .../man/xgb.QuantileDMatrix.from_iterator.Rd | 65 ++ R-package/src/init.c | 16 + R-package/src/xgboost_R.cc | 299 ++++++-- R-package/src/xgboost_R.h | 78 ++ R-package/tests/testthat/test_dmatrix.R | 257 ++++++- python-package/xgboost/core.py | 20 +- 13 files changed, 1753 insertions(+), 103 deletions(-) create mode 100644 R-package/man/xgb.DataIter.Rd create mode 100644 R-package/man/xgb.ExternalDMatrix.Rd create mode 100644 R-package/man/xgb.ProxyDMatrix.Rd create mode 100644 R-package/man/xgb.QuantileDMatrix.from_iterator.Rd diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION index bbaf3e75da4e..66e2b5692190 100644 --- a/R-package/DESCRIPTION +++ b/R-package/DESCRIPTION @@ -65,6 +65,6 @@ Imports: data.table (>= 1.9.6), jsonlite (>= 1.0) Roxygen: list(markdown = TRUE) -RoxygenNote: 7.3.0 +RoxygenNote: 7.3.1 Encoding: UTF-8 SystemRequirements: GNU make, C++17 diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE index 398b0da5a056..49f93bb57274 100644 --- a/R-package/NAMESPACE +++ b/R-package/NAMESPACE @@ -34,6 +34,11 @@ export(slice) export(xgb.DMatrix) export(xgb.DMatrix.hasinfo) export(xgb.DMatrix.save) +export(xgb.DataIter) +export(xgb.ExternalDMatrix) +export(xgb.ProxyDMatrix) +export(xgb.QuantileDMatrix) +export(xgb.QuantileDMatrix.from_iterator) export(xgb.attr) export(xgb.attributes) export(xgb.config) diff --git a/R-package/R/xgb.DMatrix.R b/R-package/R/xgb.DMatrix.R index 7c4c30bd3035..da036b952b83 100644 --- a/R-package/R/xgb.DMatrix.R +++ 
b/R-package/R/xgb.DMatrix.R @@ -1,13 +1,42 @@ #' Construct xgb.DMatrix object #' -#' Construct xgb.DMatrix object from either a dense matrix, a sparse matrix, or a local file. -#' Supported input file formats are either a LIBSVM text file or a binary file that was created previously by -#' \code{\link{xgb.DMatrix.save}}). -#' -#' @param data a \code{matrix} object (either numeric or integer), a \code{dgCMatrix} object, -#' a \code{dgRMatrix} object, -#' a \code{dsparseVector} object (only when making predictions from a fitted model, will be -#' interpreted as a row vector), or a character string representing a filename. +#' Construct an 'xgb.DMatrix' object from a given data source, which can then be passed to functions +#' such as \link{xgb.train} or \link{predict.xgb.Booster}. +#' +#' Function 'xgb.QuantileDMatrix' will construct a DMatrix with quantization for the histogram +#' method already applied to it, which can be used to reduce memory usage (compared to using a +#' a regular DMatrix first and then creating a quantization out of it) when using the histogram +#' method (`tree_method = "hist"`, which is the default algorithm), but is not usable for the +#' sorted-indices method (`tree_method = "exact"`), nor for the approximate method +#' (`tree_method = "approx"`). +#' @param data Data from which to create a DMatrix, which can then be used for fitting models or +#' for getting predictions out of a fitted model. +#' +#' Supported input types are as follows:\itemize{ +#' \item `matrix` objects, with types `numeric`, `integer`, or `logical`. +#' \item `data.frame` objects, with columns of types `numeric`, `integer`, `logical`, or `factor`. +#' +#' If passing `enable_categorical=TRUE`, columns with `factor` type will be treated as categorical. +#' Otherwise, if passing `enable_categorical=FALSE` and the data contains `factor` columns, an error +#' will be thrown. 
+#' +#' Note that xgboost uses base-0 encoding for categorical types, hence `factor` types (which use base-1 +#' encoding) will be converted inside the function call. Be aware that the encoding used for `factor` +#' types is not kept as part of the model, so in subsequent calls to `predict`, it is the user's +#' responsibility to ensure that factor columns have the same levels as the ones from which the DMatrix +#' was constructed. +#' +#' Other column types are not supported. +#' \item CSR matrices, as class `dgRMatrix` from package `Matrix`. +#' \item CSC matrices, as class `dgCMatrix` from package `Matrix`. These are \bold{not} supported for +#' 'xgb.QuantileDMatrix'. +#' \item Single-row CSR matrices, as class `dsparseVector` from package `Matrix`, which is interpreted +#' as a single row (only when making predictions from a fitted model). +#' \item Text files in SVMLight / LibSVM formats, passed as a path to the file. These are \bold{not} +#' supported for 'xgb.QuantileDMatrix'. +#' \item Binary files generated by \link{xgb.DMatrix.save}, passed as a path to the file. These are +#' \bold{not} supported for 'xgb.QuantileDMatrix'. +#' } #' @param label Label of the training data. #' @param weight Weight for each instance. #' @@ -18,11 +47,32 @@ #' @param base_margin Base margin used for boosting from existing model. #' #' In the case of multi-output models, one can also pass multi-dimensional base_margin. -#' @param missing a float value to represents missing values in data (used only when input is a dense matrix). -#' It is useful when a 0 or some other extreme value represents missing values in data. +#' @param missing A float value to represent missing values in data (not used when creating DMatrix +#' from text files). +#' It is useful to change when a zero, infinite, or some other extreme value represents missing +#' values in data. #' @param silent whether to suppress printing an informational message after loading from a file. 
#' @param feature_names Set names for features. Overrides column names in data #' frame and matrix. +#' +#' Note: columns are not referenced by name when calling `predict`, so the column order there +#' must be the same as in the DMatrix construction, regardless of the column names. +#' @param feature_types Set types for features. +#' +#' If `data` is a `data.frame` and passing `enable_categorical=TRUE`, the types will be deduced +#' automatically from the column types. +#' +#' Otherwise, one can pass a character vector with the same length as number of columns in `data`, +#' with the following possible values:\itemize{ +#' \item "c", which represents categorical columns. +#' \item "q", which represents numeric columns. +#' \item "int", which represents integer columns. +#' \item "i", which represents logical (boolean) columns. +#' } +#' +#' Note that, while categorical types are treated differently from the rest for model fitting +#' purposes, the other types do not influence the generated model, but have effects in other +#' functionalities such as feature importances. #' @param nthread Number of threads used for creating DMatrix. #' @param group Group size for all ranking group. #' @param qid Query ID for data samples, used for ranking. @@ -41,6 +91,8 @@ #' If 'data' is not a data frame, this argument is ignored. #' #' JSON/UBJSON serialization format is required for this. +#' @return An 'xgb.DMatrix' object. If calling 'xgb.QuantileDMatrix', it will have additional +#' subclass 'xgb.QuantileDMatrix'. #' #' @details #' Note that DMatrix objects are not serializable through R functions such as \code{saveRDS} or \code{save}. 
@@ -60,6 +112,7 @@ #' xgb.DMatrix.save(dtrain, fname) #' dtrain <- xgb.DMatrix(fname) #' @export +#' @rdname xgb.DMatrix xgb.DMatrix <- function( data, label = NULL, @@ -68,6 +121,7 @@ xgb.DMatrix <- function( missing = NA, silent = FALSE, feature_names = colnames(data), + feature_types = NULL, nthread = NULL, group = NULL, qid = NULL, @@ -79,7 +133,7 @@ xgb.DMatrix <- function( if (!is.null(group) && !is.null(qid)) { stop("Either one of 'group' or 'qid' should be NULL") } - ctypes <- NULL + nthread <- as.integer(NVL(nthread, -1L)) if (typeof(data) == "character") { if (length(data) > 1) { stop( @@ -91,7 +145,7 @@ xgb.DMatrix <- function( handle <- .Call(XGDMatrixCreateFromFile_R, data, as.integer(silent)) } else if (is.matrix(data)) { handle <- .Call( - XGDMatrixCreateFromMat_R, data, missing, as.integer(NVL(nthread, -1)) + XGDMatrixCreateFromMat_R, data, missing, nthread ) } else if (inherits(data, "dgCMatrix")) { handle <- .Call( @@ -101,7 +155,7 @@ xgb.DMatrix <- function( data@x, nrow(data), missing, - as.integer(NVL(nthread, -1)) + nthread ) } else if (inherits(data, "dgRMatrix")) { handle <- .Call( @@ -111,7 +165,7 @@ xgb.DMatrix <- function( data@x, ncol(data), missing, - as.integer(NVL(nthread, -1)) + nthread ) } else if (inherits(data, "dsparseVector")) { indptr <- c(0L, as.integer(length(data@i))) @@ -123,51 +177,99 @@ xgb.DMatrix <- function( data@x, length(data), missing, - as.integer(NVL(nthread, -1)) + nthread ) } else if (is.data.frame(data)) { - ctypes <- sapply(data, function(x) { - if (is.factor(x)) { + tmp <- .process.df.for.dmatrix(data, enable_categorical, feature_types) + feature_types <- tmp$feature_types + handle <- .Call( + XGDMatrixCreateFromDF_R, tmp$lst, missing, nthread + ) + rm(tmp) + } else { + stop("xgb.DMatrix does not support construction from ", typeof(data)) + } + + dmat <- handle + attributes(dmat) <- list( + class = "xgb.DMatrix", + fields = new.env() + ) + .set.dmatrix.fields( + dmat = dmat, + label = label, + weight = 
weight, + base_margin = base_margin, + feature_names = feature_names, + feature_types = feature_types, + group = group, + qid = qid, + label_lower_bound = label_lower_bound, + label_upper_bound = label_upper_bound, + feature_weights = feature_weights + ) + + return(dmat) +} + +.process.df.for.dmatrix <- function(df, enable_categorical, feature_types) { + if (!nrow(df) || !ncol(df)) { + stop("'data' is an empty data.frame.") + } + if (!is.null(feature_types)) { + if (!is.character(feature_types) || length(feature_types) != ncol(df)) { + stop( + "'feature_types' must be a character vector with one entry per column in 'data'." + ) + } + } else { + feature_types <- sapply(df, function(col) { + if (is.factor(col)) { if (!enable_categorical) { stop( "When factor type is used, the parameter `enable_categorical`", " must be set to TRUE." ) } - "c" - } else if (is.integer(x)) { - "int" - } else if (is.logical(x)) { - "i" + return("c") + } else if (is.integer(col)) { + return("int") + } else if (is.logical(col)) { + return("i") } else { - if (!is.numeric(x)) { + if (!is.numeric(col)) { stop("Invalid type in dataframe.") } - "float" + return("float") } }) - ## as.data.frame somehow converts integer/logical into real. - data <- as.data.frame(sapply(data, function(x) { - if (is.factor(x)) { - ## XGBoost uses 0-based indexing. 
- as.numeric(x) - 1 - } else { - x - } - })) - handle <- .Call( - XGDMatrixCreateFromDF_R, data, missing, as.integer(NVL(nthread, -1)) - ) - } else { - stop("xgb.DMatrix does not support construction from ", typeof(data)) } - dmat <- handle - attributes(dmat) <- list( - class = "xgb.DMatrix", - fields = new.env() - ) + lst <- lapply(df, function(col) { + is_factor <- is.factor(col) + col <- as.numeric(col) + if (is_factor) { + col <- col - 1 + } + return(col) + }) + + return(list(lst = lst, feature_types = feature_types)) +} +.set.dmatrix.fields <- function( + dmat, + label, + weight, + base_margin, + feature_names, + feature_types, + group, + qid, + label_lower_bound, + label_upper_bound, + feature_weights +) { if (!is.null(label)) { setinfo(dmat, "label", label) } @@ -180,6 +282,9 @@ xgb.DMatrix <- function( if (!is.null(feature_names)) { setinfo(dmat, "feature_name", feature_names) } + if (!is.null(feature_types)) { + setinfo(dmat, "feature_type", feature_types) + } if (!is.null(group)) { setinfo(dmat, "group", group) } @@ -195,10 +300,515 @@ xgb.DMatrix <- function( if (!is.null(feature_weights)) { setinfo(dmat, "feature_weights", feature_weights) } - if (!is.null(ctypes)) { - setinfo(dmat, "feature_type", ctypes) +} + +#' @param ref The training dataset that provides quantile information, needed when creating +#' validation/test dataset with `xgb.QuantileDMatrix`. Supplying the training DMatrix +#' as a reference means that the same quantisation applied to the training data is +#' applied to the validation/test data +#' @param max_bin The number of histogram bin, should be consistent with the training parameter +#' `max_bin`. +#' +#' This is only supported when constructing a QuantileDMatrix. 
+#' @export +#' @rdname xgb.DMatrix +xgb.QuantileDMatrix <- function( + data, + label = NULL, + weight = NULL, + base_margin = NULL, + missing = NA, + feature_names = colnames(data), + feature_types = NULL, + nthread = NULL, + group = NULL, + qid = NULL, + label_lower_bound = NULL, + label_upper_bound = NULL, + feature_weights = NULL, + enable_categorical = FALSE, + ref = NULL, + max_bin = NULL +) { + nthread <- as.integer(NVL(nthread, -1L)) + if (!is.null(ref) && !inherits(ref, "xgb.DMatrix")) { + stop("'ref' must be an xgb.DMatrix object.") + } + + # Note: when passing an integer matrix, it won't get cast to numeric. + # Since 'int' values as understood by languages like C cannot have missing values, + # R represents missingness there by assigning them a value equal to the minimum + # integer. The 'missing' value here is set before the data, so in case of integers, + # need to make the conversion manually beforehand. + if (is.matrix(data) && storage.mode(data) %in% c("integer", "logical") && is.na(missing)) { + missing <- .Call(XGGetRNAIntAsDouble) + } + + iterator_env <- as.environment( + list( + data = data, + label = label, + weight = weight, + base_margin = base_margin, + missing = missing, + feature_names = feature_names, + feature_types = feature_types, + group = group, + qid = qid, + label_lower_bound = label_lower_bound, + label_upper_bound = label_upper_bound, + feature_weights = feature_weights, + enable_categorical = enable_categorical + ) + ) + data_iterator <- .single.data.iterator(iterator_env) + + # Note: the ProxyDMatrix has its finalizer assigned in the R externalptr + # object, but that finalizer will only be called once the object is + # garbage-collected, which doesn't happen immediately after it goes out + # of scope, hence this piece of code to trigger its destruction earlier + # and free memory right away. 
+ proxy_handle <- .make.proxy.handle() + on.exit({ + .Call(XGDMatrixFree_R, proxy_handle) + }) + iterator_next <- function() { + return(xgb.ProxyDMatrix.internal(proxy_handle, data_iterator)) + } + iterator_reset <- function() { + return(data_iterator$f_reset(iterator_env)) + } + calling_env <- environment() + + dmat <- .Call( + XGQuantileDMatrixCreateFromCallback_R, + iterator_next, + iterator_reset, + calling_env, + proxy_handle, + nthread, + missing, + max_bin, + ref + ) + attributes(dmat) <- list( + class = c("xgb.DMatrix", "xgb.QuantileDMatrix"), + fields = attributes(proxy_handle)$fields + ) + return(dmat) +} + +#' @title XGBoost Data Iterator +#' @description Interface to create a custom data iterator in order to construct a DMatrix +#' from external memory. +#' +#' This function is responsible for generating an R object structure containing callback +#' functions and an environment shared with them. +#' +#' The output structure from this function is then meant to be passed to \link{xgb.ExternalDMatrix}, +#' which will consume the data and create a DMatrix from it by executing the callback functions. +#' +#' For more information, and for a usage example, see the documentation for \link{xgb.ExternalDMatrix}. +#' @param env An R environment to pass to the callback functions supplied here, which can be +#' used to keep track of variables to determine how to handle the batches. +#' +#' For example, one might want to keep track of an iteration number in this environment in order +#' to know which part of the data to pass next. +#' @param f_next `function(env)` which is responsible for:\itemize{ +#' \item Accessing or retrieving the next batch of data in the iterator. +#' \item Supplying this data by calling function \link{xgb.ProxyDMatrix} on it and returning the result. +#' \item Keeping track of where in the iterator batch it is or will go next, which can for example +#' be done by modifying variables in the `env` variable that is passed here. 
+#' \item Signaling whether there are more batches to be consumed or not, by returning `NULL` +#' when the stream of data ends (all batches in the iterator have been consumed), or the result from +#' calling \link{xgb.ProxyDMatrix} when there are more batches in the line to be consumed. +#' } +#' @param f_reset `function(env)` which is responsible for resetting the data iterator +#' (i.e. taking it back to the first batch, called before and after the sequence of batches +#' has been consumed). +#' +#' Note that, after resetting the iterator, the batches will be accessed again, so the same data +#' (and in the same order) must be passed in subsequent iterations. +#' @return An `xgb.DataIter` object, containing the same inputs supplied here, which can then +#' be passed to \link{xgb.ExternalDMatrix}. +#' @seealso \link{xgb.ExternalDMatrix}, \link{xgb.ProxyDMatrix}. +#' @export +xgb.DataIter <- function(env = new.env(), f_next, f_reset) { + if (!is.function(f_next)) { + stop("'f_next' must be a function.") + } + if (!is.function(f_reset)) { + stop("'f_reset' must be a function.") + } + out <- list( + env = env, + f_next = f_next, + f_reset = f_reset + ) + class(out) <- "xgb.DataIter" + return(out) +} + +.qdm.single.fnext <- function(env) { + curr_iter <- env[["iter"]] + if (curr_iter >= 1L) { + return(NULL) + } + + on.exit({ + env[["iter"]] <- curr_iter + 1L + }) + return( + xgb.ProxyDMatrix( + data = env[["data"]], + label = env[["label"]], + weight = env[["weight"]], + base_margin = env[["base_margin"]], + feature_names = env[["feature_names"]], + feature_types = env[["feature_types"]], + group = env[["group"]], + qid = env[["qid"]], + label_lower_bound = env[["label_lower_bound"]], + label_upper_bound = env[["label_upper_bound"]], + feature_weights = env[["feature_weights"]], + enable_categorical = env[["enable_categorical"]] + ) + ) +} + +.qdm.single.freset <- function(env) { + env[["iter"]] <- 0L + return(invisible(NULL)) +} + +.single.data.iterator <- 
function(env) { + env[["iter"]] <- 0L + return(xgb.DataIter(env, .qdm.single.fnext, .qdm.single.freset)) +} + +# Only for internal usage +.make.proxy.handle <- function() { + out <- .Call(XGProxyDMatrixCreate_R) + attributes(out) <- list( + class = c("xgb.DMatrix", "xgb.ProxyDMatrixHandle"), + fields = new.env() + ) + return(out) +} + +#' @title Proxy DMatrix Updater +#' @description Helper function to supply data in batches of a data iterator when +#' constructing a DMatrix from external memory through \link{xgb.ExternalDMatrix} +#' or through \link{xgb.QuantileDMatrix.from_iterator}. +#' +#' This function is \bold{only} meant to be called inside of a callback function (which +#' is passed as argument to function \link{xgb.DataIter} to construct a data iterator) +#' when constructing a DMatrix through external memory - otherwise, one should call +#' \link{xgb.DMatrix} or \link{xgb.QuantileDMatrix}. +#' +#' The object that results from calling this function directly is \bold{not} like the other +#' `xgb.DMatrix` variants - i.e. cannot be used to train a model, nor to get predictions - only +#' possible usage is to supply data to an iterator, from which a DMatrix is then constructed. +#' +#' For more information and for example usage, see the documentation for \link{xgb.ExternalDMatrix}. +#' @inheritParams xgb.DMatrix +#' @param data Batch of data belonging to this batch. +#' +#' Note that not all of the input types supported by \link{xgb.DMatrix} are possible +#' to pass here. Supported types are:\itemize{ +#' \item `matrix`, with types `numeric`, `integer`, and `logical`. Note that for types +#' `integer` and `logical`, missing values might not be automatically recognized +#' as such - see the documentation for parameter `missing` in \link{xgb.ExternalDMatrix} +#' for details on this. +#' \item `data.frame`, with the same types as supported by 'xgb.DMatrix' and same +#' conversions applied to it. 
See the documentation for parameter `data` in +#' \link{xgb.DMatrix} for details on it. +#' \item CSR matrices, as class `dgRMatrix` from package `Matrix`. +#' } +#' @return An object of class `xgb.ProxyDMatrix`, which is just a list containing the +#' data and parameters passed here. It does \bold{not} inherit from `xgb.DMatrix`. +#' @seealso \link{xgb.DataIter}, \link{xgb.ExternalDMatrix}. +#' @export +xgb.ProxyDMatrix <- function( + data, + label = NULL, + weight = NULL, + base_margin = NULL, + feature_names = colnames(data), + feature_types = NULL, + group = NULL, + qid = NULL, + label_lower_bound = NULL, + label_upper_bound = NULL, + feature_weights = NULL, + enable_categorical = FALSE +) { + stopifnot(inherits(data, c("matrix", "data.frame", "dgRMatrix"))) + out <- list( + data = data, + label = label, + weight = weight, + base_margin = base_margin, + feature_names = feature_names, + feature_types = feature_types, + group = group, + qid = qid, + label_lower_bound = label_lower_bound, + label_upper_bound = label_upper_bound, + feature_weights = feature_weights, + enable_categorical = enable_categorical + ) + class(out) <- "xgb.ProxyDMatrix" + return(out) +} + +xgb.ProxyDMatrix.internal <- function(proxy_handle, data_iterator) { + lst <- data_iterator$f_next(data_iterator$env) + if (is.null(lst)) { + return(0L) + } + if (!inherits(lst, "xgb.ProxyDMatrix")) { + stop("DataIter 'f_next' must return either NULL or the result from calling 'xgb.ProxyDMatrix'.") + } + + if (!is.null(lst$group) && !is.null(lst$qid)) { + stop("Either one of 'group' or 'qid' should be NULL") + } + if (is.data.frame(lst$data)) { + tmp <- .process.df.for.dmatrix(lst$data, lst$enable_categorical, lst$feature_types) + lst$feature_types <- tmp$feature_types + .Call(XGProxyDMatrixSetDataColumnar_R, proxy_handle, tmp$lst) + rm(tmp) + } else if (is.matrix(lst$data)) { + .Call(XGProxyDMatrixSetDataDense_R, proxy_handle, lst$data) + } else if (inherits(lst$data, "dgRMatrix")) { + tmp <- list(p = 
lst$data@p, j = lst$data@j, x = lst$data@x, ncol = ncol(lst$data)) + .Call(XGProxyDMatrixSetDataCSR_R, proxy_handle, tmp) + } else { + stop("'data' has unsupported type.") + } + + .set.dmatrix.fields( + dmat = proxy_handle, + label = lst$label, + weight = lst$weight, + base_margin = lst$base_margin, + feature_names = lst$feature_names, + feature_types = lst$feature_types, + group = lst$group, + qid = lst$qid, + label_lower_bound = lst$label_lower_bound, + label_upper_bound = lst$label_upper_bound, + feature_weights = lst$feature_weights + ) + + return(1L) +} + +#' @title DMatrix from External Data +#' @description Create a special type of xgboost 'DMatrix' object from external data +#' supplied by an \link{xgb.DataIter} object, potentially passed in batches from a +#' bigger set that might not fit entirely in memory. +#' +#' The data supplied by the iterator is accessed on-demand as needed, multiple times, +#' without being concatenated, but note that fields like 'label' \bold{will} be +#' concatenated from multiple calls to the data iterator. +#' +#' For more information, see the guide 'Using XGBoost External Memory Version': +#' \url{https://xgboost.readthedocs.io/en/stable/tutorials/external_memory.html} +#' @inheritParams xgb.DMatrix +#' @param data_iterator A data iterator structure as returned by \link{xgb.DataIter}, +#' which includes an environment shared between function calls, and functions to access +#' the data in batches on-demand. +#' @param cache_prefix The path of cache file, caller must initialize all the directories in this path. +#' @param missing A float value to represents missing values in data. +#' +#' Note that, while functions like \link{xgb.DMatrix} can take a generic `NA` and interpret it +#' correctly for different types like `numeric` and `integer`, if an `NA` value is passed here, +#' it will not be adapted for different input types. 
+#' +#' For example, in R `integer` types, missing values are represented by integer number `-2147483648` +#' (since machine 'integer' types do not have an inherent 'NA' value) - hence, if one passes `NA`, +#' which is interpreted as a floating-point NaN by 'xgb.ExternalDMatrix' and by +#' 'xgb.QuantileDMatrix.from_iterator', these integer missing values will not be treated as missing. +#' This should not pose any problem for `numeric` types, since they do have an inherent NaN value. +#' @return An 'xgb.DMatrix' object, with subclass 'xgb.ExternalDMatrix', in which the data is not +#' held internally but accessed through the iterator when needed. +#' @seealso \link{xgb.DataIter}, \link{xgb.ProxyDMatrix}, \link{xgb.QuantileDMatrix.from_iterator} +#' @examples +#' library(xgboost) +#' data(mtcars) +#' +#' # this custom environment will be passed to the iterator +#' # functions at each call. It's up to the user to keep +#' # track of the iteration number in this environment. +#' iterator_env <- as.environment( +#' list( +#' iter = 0, +#' x = mtcars[, -1], +#' y = mtcars[, 1] +#' ) +#' ) +#' +#' # Data is passed in two batches. +#' # In this example, batches are obtained by subsetting the 'x' variable. +#' # This is not advantageous to do, since the data is already loaded in memory +#' # and can be passed in full in one go, but there can be situations in which +#' # only a subset of the data will fit in the computer's memory, and it can +#' # be loaded in batches that are accessed one-at-a-time only. 
+#' iterator_next <- function(iterator_env) { +#' curr_iter <- iterator_env[["iter"]] +#' if (curr_iter >= 2) { +#' # there are only two batches, so this signals end of the stream +#' return(NULL) +#' } +#' +#' if (curr_iter == 0) { +#' x_batch <- iterator_env[["x"]][1:16, ] +#' y_batch <- iterator_env[["y"]][1:16] +#' } else { +#' x_batch <- iterator_env[["x"]][17:32, ] +#' y_batch <- iterator_env[["y"]][17:32] +#' } +#' on.exit({ +#' iterator_env[["iter"]] <- curr_iter + 1 +#' }) +#' +#' # Function 'xgb.ProxyDMatrix' must be called manually +#' # at each batch with all the appropriate attributes, +#' # such as feature names and feature types. +#' return(xgb.ProxyDMatrix(data = x_batch, label = y_batch)) +#' } +#' +#' # This moves the iterator back to its beginning +#' iterator_reset <- function(iterator_env) { +#' iterator_env[["iter"]] <- 0 +#' } +#' +#' data_iterator <- xgb.DataIter( +#' env = iterator_env, +#' f_next = iterator_next, +#' f_reset = iterator_reset +#' ) +#' cache_prefix <- tempdir() +#' +#' # DMatrix will be constructed from the iterator's batches +#' dm <- xgb.ExternalDMatrix(data_iterator, cache_prefix, nthread = 1) +#' +#' # After construction, can be used as a regular DMatrix +#' params <- list(nthread = 1, objective = "reg:squarederror") +#' model <- xgb.train(data = dm, nrounds = 2, params = params) +#' +#' # Predictions can also be called on it, and should be the same +#' # as if the data were passed differently. 
+#' pred_dm <- predict(model, dm) +#' pred_mat <- predict(model, as.matrix(mtcars[, -1])) +#' @export +xgb.ExternalDMatrix <- function( + data_iterator, + cache_prefix = tempdir(), + missing = NA, + nthread = NULL +) { + stopifnot(inherits(data_iterator, "xgb.DataIter")) + stopifnot(is.character(cache_prefix)) + + cache_prefix <- path.expand(cache_prefix) + nthread <- as.integer(NVL(nthread, -1L)) + + proxy_handle <- .make.proxy.handle() + on.exit({ + .Call(XGDMatrixFree_R, proxy_handle) + }) + iterator_next <- function() { + return(xgb.ProxyDMatrix.internal(proxy_handle, data_iterator)) + } + iterator_reset <- function() { + return(data_iterator$f_reset(data_iterator$env)) } + calling_env <- environment() + dmat <- .Call( + XGDMatrixCreateFromCallback_R, + iterator_next, + iterator_reset, + calling_env, + proxy_handle, + nthread, + missing, + cache_prefix + ) + + attributes(dmat) <- list( + class = c("xgb.DMatrix", "xgb.ExternalDMatrix"), + fields = attributes(proxy_handle)$fields + ) + return(dmat) +} + + +#' @title QuantileDMatrix from External Data +#' @description Create an `xgb.QuantileDMatrix` object (exact same class as would be returned by +#' calling function \link{xgb.QuantileDMatrix}, with the same advantages and limitations) from +#' external data supplied by an \link{xgb.DataIter} object, potentially passed in batches from +#' a bigger set that might not fit entirely in memory, same way as \link{xgb.ExternalDMatrix}. +#' +#' Note that, while external data will only be loaded through the iterator (thus the full data +#' might not be held entirely in-memory), the quantized representation of the data will get +#' created in-memory, being concatenated from multiple calls to the data iterator. The quantized +#' version is typically lighter than the original data, so there might be cases in which this +#' representation could potentially fit in memory even if the full data doesn't. 
+#' +#' For more information, see the guide 'Using XGBoost External Memory Version': +#' \url{https://xgboost.readthedocs.io/en/stable/tutorials/external_memory.html} +#' @inheritParams xgb.ExternalDMatrix +#' @inheritParams xgb.QuantileDMatrix +#' @return An 'xgb.DMatrix' object, with subclass 'xgb.QuantileDMatrix'. +#' @seealso \link{xgb.DataIter}, \link{xgb.ProxyDMatrix}, \link{xgb.ExternalDMatrix}, +#' \link{xgb.QuantileDMatrix} +#' @export +xgb.QuantileDMatrix.from_iterator <- function( # nolint + data_iterator, + missing = NA, + nthread = NULL, + ref = NULL, + max_bin = NULL +) { + stopifnot(inherits(data_iterator, "xgb.DataIter")) + if (!is.null(ref) && !inherits(ref, "xgb.DMatrix")) { + stop("'ref' must be an xgb.DMatrix object.") + } + + nthread <- as.integer(NVL(nthread, -1L)) + + proxy_handle <- .make.proxy.handle() + on.exit({ + .Call(XGDMatrixFree_R, proxy_handle) + }) + iterator_next <- function() { + return(xgb.ProxyDMatrix.internal(proxy_handle, data_iterator)) + } + iterator_reset <- function() { + return(data_iterator$f_reset(data_iterator$env)) + } + calling_env <- environment() + + dmat <- .Call( + XGQuantileDMatrixCreateFromCallback_R, + iterator_next, + iterator_reset, + calling_env, + proxy_handle, + nthread, + missing, + max_bin, + ref + ) + + attributes(dmat) <- list( + class = c("xgb.DMatrix", "xgb.QuantileDMatrix"), + fields = attributes(proxy_handle)$fields + ) return(dmat) } @@ -712,7 +1322,17 @@ print.xgb.DMatrix <- function(x, verbose = FALSE, ...) { cat("INVALID xgb.DMatrix object. 
Must be constructed anew.\n") return(invisible(x)) } - cat('xgb.DMatrix dim:', nrow(x), 'x', ncol(x), ' info: ') + class_print <- if (inherits(x, "xgb.QuantileDMatrix")) { + "xgb.QuantileDMatrix" + } else if (inherits(x, "xgb.ExternalDMatrix")) { + "xgb.ExternalDMatrix" + } else if (inherits(x, "xgb.ProxyDMatrix")) { + "xgb.ProxyDMatrix" + } else { + "xgb.DMatrix" + } + + cat(class_print, ' dim:', nrow(x), 'x', ncol(x), ' info: ') infos <- character(0) if (xgb.DMatrix.hasinfo(x, 'label')) infos <- 'label' if (xgb.DMatrix.hasinfo(x, 'weight')) infos <- c(infos, 'weight') diff --git a/R-package/man/xgb.DMatrix.Rd b/R-package/man/xgb.DMatrix.Rd index eb667377f0b3..ceb60dc42906 100644 --- a/R-package/man/xgb.DMatrix.Rd +++ b/R-package/man/xgb.DMatrix.Rd @@ -2,6 +2,7 @@ % Please edit documentation in R/xgb.DMatrix.R \name{xgb.DMatrix} \alias{xgb.DMatrix} +\alias{xgb.QuantileDMatrix} \title{Construct xgb.DMatrix object} \usage{ xgb.DMatrix( @@ -12,6 +13,7 @@ xgb.DMatrix( missing = NA, silent = FALSE, feature_names = colnames(data), + feature_types = NULL, nthread = NULL, group = NULL, qid = NULL, @@ -20,12 +22,55 @@ xgb.DMatrix( feature_weights = NULL, enable_categorical = FALSE ) + +xgb.QuantileDMatrix( + data, + label = NULL, + weight = NULL, + base_margin = NULL, + missing = NA, + feature_names = colnames(data), + feature_types = NULL, + nthread = NULL, + group = NULL, + qid = NULL, + label_lower_bound = NULL, + label_upper_bound = NULL, + feature_weights = NULL, + enable_categorical = FALSE, + ref = NULL, + max_bin = NULL +) } \arguments{ -\item{data}{a \code{matrix} object (either numeric or integer), a \code{dgCMatrix} object, -a \code{dgRMatrix} object, -a \code{dsparseVector} object (only when making predictions from a fitted model, will be -interpreted as a row vector), or a character string representing a filename.} +\item{data}{Data from which to create a DMatrix, which can then be used for fitting models or +for getting predictions out of a fitted model. 
+ +Supported input types are as follows:\itemize{ +\item \code{matrix} objects, with types \code{numeric}, \code{integer}, or \code{logical}. +\item \code{data.frame} objects, with columns of types \code{numeric}, \code{integer}, \code{logical}, or \code{factor}. + +If passing \code{enable_categorical=TRUE}, columns with \code{factor} type will be treated as categorical. +Otherwise, if passing \code{enable_categorical=FALSE} and the data contains \code{factor} columns, an error +will be thrown. + +Note that xgboost uses base-0 encoding for categorical types, hence \code{factor} types (which use base-1 +encoding) will be converted inside the function call. Be aware that the encoding used for \code{factor} +types is not kept as part of the model, so in subsequent calls to \code{predict}, it is the user's +responsibility to ensure that factor columns have the same levels as the ones from which the DMatrix +was constructed. + +Other column types are not supported. +\item CSR matrices, as class \code{dgRMatrix} from package \code{Matrix}. +\item CSC matrices, as class \code{dgCMatrix} from package \code{Matrix}. These are \bold{not} supported for +'xgb.QuantileDMatrix'. +\item Single-row CSR matrices, as class \code{dsparseVector} from package \code{Matrix}, which is interpreted +as a single row (only when making predictions from a fitted model). +\item Text files in SVMLight / LibSVM formats, passed as a path to the file. These are \bold{not} +supported for 'xgb.QuantileDMatrix'. +\item Binary files generated by \link{xgb.DMatrix.save}, passed as a path to the file. These are +\bold{not} supported for 'xgb.QuantileDMatrix'. +}} \item{label}{Label of the training data.} @@ -41,13 +86,36 @@ so it doesn't make sense to assign weights to individual data points.} \if{html}{\out{
}}\preformatted{ In the case of multi-output models, one can also pass multi-dimensional base_margin. }\if{html}{\out{
}}} -\item{missing}{a float value to represents missing values in data (used only when input is a dense matrix). -It is useful when a 0 or some other extreme value represents missing values in data.} +\item{missing}{A float value to represent missing values in data (not used when creating DMatrix +from text files). +It is useful to change when a zero, infinite, or some other extreme value represents missing +values in data.} \item{silent}{whether to suppress printing an informational message after loading from a file.} \item{feature_names}{Set names for features. Overrides column names in data -frame and matrix.} +frame and matrix. + +\if{html}{\out{
}}\preformatted{ Note: columns are not referenced by name when calling `predict`, so the column order there + must be the same as in the DMatrix construction, regardless of the column names. +}\if{html}{\out{
}}} + +\item{feature_types}{Set types for features. + +If \code{data} is a \code{data.frame} and passing \code{enable_categorical=TRUE}, the types will be deduced +automatically from the column types. + +Otherwise, one can pass a character vector with the same length as number of columns in \code{data}, +with the following possible values:\itemize{ +\item "c", which represents categorical columns. +\item "q", which represents numeric columns. +\item "int", which represents integer columns. +\item "i", which represents logical (boolean) columns. +} + +Note that, while categorical types are treated differently from the rest for model fitting +purposes, the other types do not influence the generated model, but have effects in other +functionalities such as feature importances.} \item{nthread}{Number of threads used for creating DMatrix.} @@ -74,13 +142,33 @@ frame and matrix.} JSON/UBJSON serialization format is required for this. }\if{html}{\out{}}} + +\item{ref}{The training dataset that provides quantile information, needed when creating +validation/test dataset with \code{xgb.QuantileDMatrix}. Supplying the training DMatrix +as a reference means that the same quantisation applied to the training data is +applied to the validation/test data.} + +\item{max_bin}{The number of histogram bins; should be consistent with the training parameter +\code{max_bin}. + +This is only supported when constructing a QuantileDMatrix.} +} +\value{ +An 'xgb.DMatrix' object. If calling 'xgb.QuantileDMatrix', it will have additional +subclass 'xgb.QuantileDMatrix'. } \description{ -Construct xgb.DMatrix object from either a dense matrix, a sparse matrix, or a local file. -Supported input file formats are either a LIBSVM text file or a binary file that was created previously by -\code{\link{xgb.DMatrix.save}}). +Construct an 'xgb.DMatrix' object from a given data source, which can then be passed to functions +such as \link{xgb.train} or \link{predict.xgb.Booster}. 
+} \details{ +Function 'xgb.QuantileDMatrix' will construct a DMatrix with quantization for the histogram +method already applied to it, which can be used to reduce memory usage (compared to using a +regular DMatrix first and then creating a quantization out of it) when using the histogram +method (\code{tree_method = "hist"}, which is the default algorithm), but is not usable for the +sorted-indices method (\code{tree_method = "exact"}), nor for the approximate method +(\code{tree_method = "approx"}). + Note that DMatrix objects are not serializable through R functions such as \code{saveRDS} or \code{save}. If a DMatrix gets serialized and then de-serialized (for example, when saving data in an R session or caching chunks in an Rmd file), the resulting object will not be usable anymore and will need to be reconstructed diff --git a/R-package/man/xgb.DataIter.Rd b/R-package/man/xgb.DataIter.Rd new file mode 100644 index 000000000000..29cf5acc9cf4 --- /dev/null +++ b/R-package/man/xgb.DataIter.Rd @@ -0,0 +1,51 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/xgb.DMatrix.R +\name{xgb.DataIter} +\alias{xgb.DataIter} +\title{XGBoost Data Iterator} +\usage{ +xgb.DataIter(env = new.env(), f_next, f_reset) +} +\arguments{ +\item{env}{An R environment to pass to the callback functions supplied here, which can be +used to keep track of variables to determine how to handle the batches. + +For example, one might want to keep track of an iteration number in this environment in order +to know which part of the data to pass next.} + +\item{f_next}{\verb{function(env)} which is responsible for:\itemize{ +\item Accessing or retrieving the next batch of data in the iterator. +\item Supplying this data by calling function \link{xgb.ProxyDMatrix} on it and returning the result. 
+\item Keeping track of where in the iterator batch it is or will go next, which can for example +be done by modifying variables in the \code{env} variable that is passed here. +\item Signaling whether there are more batches to be consumed or not, by returning \code{NULL} +when the stream of data ends (all batches in the iterator have been consumed), or the result from +calling \link{xgb.ProxyDMatrix} when there are more batches in the line to be consumed. +}} + +\item{f_reset}{\verb{function(env)} which is responsible for resetting the data iterator +(i.e. taking it back to the first batch, called before and after the sequence of batches +has been consumed). + +Note that, after resetting the iterator, the batches will be accessed again, so the same data +(and in the same order) must be passed in subsequent iterations.} +} +\value{ +An \code{xgb.DataIter} object, containing the same inputs supplied here, which can then +be passed to \link{xgb.ExternalDMatrix}. +} +\description{ +Interface to create a custom data iterator in order to construct a DMatrix +from external memory. + +This function is responsible for generating an R object structure containing callback +functions and an environment shared with them. + +The output structure from this function is then meant to be passed to \link{xgb.ExternalDMatrix}, +which will consume the data and create a DMatrix from it by executing the callback functions. + +For more information, and for a usage example, see the documentation for \link{xgb.ExternalDMatrix}. +} +\seealso{ +\link{xgb.ExternalDMatrix}, \link{xgb.ProxyDMatrix}. 
+} diff --git a/R-package/man/xgb.ExternalDMatrix.Rd b/R-package/man/xgb.ExternalDMatrix.Rd new file mode 100644 index 000000000000..3e7844990b50 --- /dev/null +++ b/R-package/man/xgb.ExternalDMatrix.Rd @@ -0,0 +1,122 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/xgb.DMatrix.R +\name{xgb.ExternalDMatrix} +\alias{xgb.ExternalDMatrix} +\title{DMatrix from External Data} +\usage{ +xgb.ExternalDMatrix( + data_iterator, + cache_prefix = tempdir(), + missing = NA, + nthread = NULL +) +} +\arguments{ +\item{data_iterator}{A data iterator structure as returned by \link{xgb.DataIter}, +which includes an environment shared between function calls, and functions to access +the data in batches on-demand.} + +\item{cache_prefix}{The path of cache file, caller must initialize all the directories in this path.} + +\item{missing}{A float value to represent missing values in data. + +Note that, while functions like \link{xgb.DMatrix} can take a generic \code{NA} and interpret it +correctly for different types like \code{numeric} and \code{integer}, if an \code{NA} value is passed here, +it will not be adapted for different input types. + +For example, in R \code{integer} types, missing values are represented by integer number \code{-2147483648} +(since machine 'integer' types do not have an inherent 'NA' value) - hence, if one passes \code{NA}, +which is interpreted as a floating-point NaN by 'xgb.ExternalDMatrix' and by +'xgb.QuantileDMatrix.from_iterator', these integer missing values will not be treated as missing. +This should not pose any problem for \code{numeric} types, since they do have an inherent NaN value.} + +\item{nthread}{Number of threads used for creating DMatrix.} +} +\value{ +An 'xgb.DMatrix' object, with subclass 'xgb.ExternalDMatrix', in which the data is not +held internally but accessed through the iterator when needed. 
+} +\description{ +Create a special type of xgboost 'DMatrix' object from external data +supplied by an \link{xgb.DataIter} object, potentially passed in batches from a +bigger set that might not fit entirely in memory. + +The data supplied by the iterator is accessed on-demand as needed, multiple times, +without being concatenated, but note that fields like 'label' \bold{will} be +concatenated from multiple calls to the data iterator. + +For more information, see the guide 'Using XGBoost External Memory Version': +\url{https://xgboost.readthedocs.io/en/stable/tutorials/external_memory.html} +} +\examples{ +library(xgboost) +data(mtcars) + +# this custom environment will be passed to the iterator +# functions at each call. It's up to the user to keep +# track of the iteration number in this environment. +iterator_env <- as.environment( + list( + iter = 0, + x = mtcars[, -1], + y = mtcars[, 1] + ) +) + +# Data is passed in two batches. +# In this example, batches are obtained by subsetting the 'x' variable. +# This is not advantageous to do, since the data is already loaded in memory +# and can be passed in full in one go, but there can be situations in which +# only a subset of the data will fit in the computer's memory, and it can +# be loaded in batches that are accessed one-at-a-time only. +iterator_next <- function(iterator_env) { + curr_iter <- iterator_env[["iter"]] + if (curr_iter >= 2) { + # there are only two batches, so this signals end of the stream + return(NULL) + } + + if (curr_iter == 0) { + x_batch <- iterator_env[["x"]][1:16, ] + y_batch <- iterator_env[["y"]][1:16] + } else { + x_batch <- iterator_env[["x"]][17:32, ] + y_batch <- iterator_env[["y"]][17:32] + } + on.exit({ + iterator_env[["iter"]] <- curr_iter + 1 + }) + + # Function 'xgb.ProxyDMatrix' must be called manually + # at each batch with all the appropriate attributes, + # such as feature names and feature types. 
+ return(xgb.ProxyDMatrix(data = x_batch, label = y_batch)) +} + +# This moves the iterator back to its beginning +iterator_reset <- function(iterator_env) { + iterator_env[["iter"]] <- 0 +} + +data_iterator <- xgb.DataIter( + env = iterator_env, + f_next = iterator_next, + f_reset = iterator_reset +) +cache_prefix <- tempdir() + +# DMatrix will be constructed from the iterator's batches +dm <- xgb.ExternalDMatrix(data_iterator, cache_prefix, nthread = 1) + +# After construction, can be used as a regular DMatrix +params <- list(nthread = 1, objective = "reg:squarederror") +model <- xgb.train(data = dm, nrounds = 2, params = params) + +# Predictions can also be called on it, and should be the same +# as if the data were passed differently. +pred_dm <- predict(model, dm) +pred_mat <- predict(model, as.matrix(mtcars[, -1])) +} +\seealso{ +\link{xgb.DataIter}, \link{xgb.ProxyDMatrix}, \link{xgb.QuantileDMatrix.from_iterator} +} diff --git a/R-package/man/xgb.ProxyDMatrix.Rd b/R-package/man/xgb.ProxyDMatrix.Rd new file mode 100644 index 000000000000..5a9b6251af40 --- /dev/null +++ b/R-package/man/xgb.ProxyDMatrix.Rd @@ -0,0 +1,121 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/xgb.DMatrix.R +\name{xgb.ProxyDMatrix} +\alias{xgb.ProxyDMatrix} +\title{Proxy DMatrix Updater} +\usage{ +xgb.ProxyDMatrix( + data, + label = NULL, + weight = NULL, + base_margin = NULL, + feature_names = colnames(data), + feature_types = NULL, + group = NULL, + qid = NULL, + label_lower_bound = NULL, + label_upper_bound = NULL, + feature_weights = NULL, + enable_categorical = FALSE +) +} +\arguments{ +\item{data}{Batch of data belonging to this batch. + +Note that not all of the input types supported by \link{xgb.DMatrix} are possible +to pass here. Supported types are:\itemize{ +\item \code{matrix}, with types \code{numeric}, \code{integer}, and \code{logical}. 
Note that for types +\code{integer} and \code{logical}, missing values might not be automatically recognized as +such - see the documentation for parameter \code{missing} in \link{xgb.ExternalDMatrix} +for details on this. +\item \code{data.frame}, with the same types as supported by 'xgb.DMatrix' and same +conversions applied to it. See the documentation for parameter \code{data} in +\link{xgb.DMatrix} for details on it. +\item CSR matrices, as class \code{dgRMatrix} from package \code{Matrix}. +}} + +\item{label}{Label of the training data.} + +\item{weight}{Weight for each instance. + +Note that, for ranking task, weights are per-group. In ranking task, one weight +is assigned to each group (not each data point). This is because we +only care about the relative ordering of data points within each group, +so it doesn't make sense to assign weights to individual data points.} + +\item{base_margin}{Base margin used for boosting from existing model. + +\if{html}{\out{
}}\preformatted{ In the case of multi-output models, one can also pass multi-dimensional base_margin. +}\if{html}{\out{
}}} + +\item{feature_names}{Set names for features. Overrides column names in data +frame and matrix. + +\if{html}{\out{
}}\preformatted{ Note: columns are not referenced by name when calling `predict`, so the column order there + must be the same as in the DMatrix construction, regardless of the column names. +}\if{html}{\out{
}}} + +\item{feature_types}{Set types for features. + +If \code{data} is a \code{data.frame} and passing \code{enable_categorical=TRUE}, the types will be deduced +automatically from the column types. + +Otherwise, one can pass a character vector with the same length as number of columns in \code{data}, +with the following possible values:\itemize{ +\item "c", which represents categorical columns. +\item "q", which represents numeric columns. +\item "int", which represents integer columns. +\item "i", which represents logical (boolean) columns. +} + +Note that, while categorical types are treated differently from the rest for model fitting +purposes, the other types do not influence the generated model, but have effects in other +functionalities such as feature importances.} + +\item{group}{Group size for all ranking group.} + +\item{qid}{Query ID for data samples, used for ranking.} + +\item{label_lower_bound}{Lower bound for survival training.} + +\item{label_upper_bound}{Upper bound for survival training.} + +\item{feature_weights}{Set feature weights for column sampling.} + +\item{enable_categorical}{Experimental support of specializing for categorical features. + +\if{html}{\out{
}}\preformatted{ If passing 'TRUE' and 'data' is a data frame, + columns of categorical types will automatically + be set to be of categorical type (feature_type='c') in the resulting DMatrix. + + If passing 'FALSE' and 'data' is a data frame with categorical columns, + it will result in an error being thrown. + + If 'data' is not a data frame, this argument is ignored. + + JSON/UBJSON serialization format is required for this. +}\if{html}{\out{
}}} +} +\value{ +An object of class \code{xgb.ProxyDMatrix}, which is just a list containing the +data and parameters passed here. It does \bold{not} inherit from \code{xgb.DMatrix}. +} +\description{ +Helper function to supply data in batches of a data iterator when +constructing a DMatrix from external memory through \link{xgb.ExternalDMatrix} +or through \link{xgb.QuantileDMatrix.from_iterator}. + +This function is \bold{only} meant to be called inside of a callback function (which +is passed as argument to function \link{xgb.DataIter} to construct a data iterator) +when constructing a DMatrix through external memory - otherwise, one should call +\link{xgb.DMatrix} or \link{xgb.QuantileDMatrix}. + +The object that results from calling this function directly is \bold{not} like the other +\code{xgb.DMatrix} variants - i.e. cannot be used to train a model, nor to get predictions - only +possible usage is to supply data to an iterator, from which a DMatrix is then constructed. + +For more information and for example usage, see the documentation for \link{xgb.ExternalDMatrix}. +} +\seealso{ +\link{xgb.DataIter}, \link{xgb.ExternalDMatrix}. 
+} diff --git a/R-package/man/xgb.QuantileDMatrix.from_iterator.Rd b/R-package/man/xgb.QuantileDMatrix.from_iterator.Rd new file mode 100644 index 000000000000..21f24576dcb1 --- /dev/null +++ b/R-package/man/xgb.QuantileDMatrix.from_iterator.Rd @@ -0,0 +1,65 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/xgb.DMatrix.R +\name{xgb.QuantileDMatrix.from_iterator} +\alias{xgb.QuantileDMatrix.from_iterator} +\title{QuantileDMatrix from External Data} +\usage{ +xgb.QuantileDMatrix.from_iterator( + data_iterator, + missing = NA, + nthread = NULL, + ref = NULL, + max_bin = NULL +) +} +\arguments{ +\item{data_iterator}{A data iterator structure as returned by \link{xgb.DataIter}, +which includes an environment shared between function calls, and functions to access +the data in batches on-demand.} + +\item{missing}{A float value to represent missing values in data. + +Note that, while functions like \link{xgb.DMatrix} can take a generic \code{NA} and interpret it +correctly for different types like \code{numeric} and \code{integer}, if an \code{NA} value is passed here, +it will not be adapted for different input types. + +For example, in R \code{integer} types, missing values are represented by integer number \code{-2147483648} +(since machine 'integer' types do not have an inherent 'NA' value) - hence, if one passes \code{NA}, +which is interpreted as a floating-point NaN by 'xgb.ExternalDMatrix' and by +'xgb.QuantileDMatrix.from_iterator', these integer missing values will not be treated as missing. +This should not pose any problem for \code{numeric} types, since they do have an inherent NaN value.} + +\item{nthread}{Number of threads used for creating DMatrix.} + +\item{ref}{The training dataset that provides quantile information, needed when creating +validation/test dataset with \code{xgb.QuantileDMatrix}. 
Supplying the training DMatrix +as a reference means that the same quantisation applied to the training data is +applied to the validation/test data.} + +\item{max_bin}{The number of histogram bins; should be consistent with the training parameter +\code{max_bin}. + +This is only supported when constructing a QuantileDMatrix.} +} +\value{ +An 'xgb.DMatrix' object, with subclass 'xgb.QuantileDMatrix'. +} +\description{ +Create an \code{xgb.QuantileDMatrix} object (exact same class as would be returned by +calling function \link{xgb.QuantileDMatrix}, with the same advantages and limitations) from +external data supplied by an \link{xgb.DataIter} object, potentially passed in batches from +a bigger set that might not fit entirely in memory, same way as \link{xgb.ExternalDMatrix}. + +Note that, while external data will only be loaded through the iterator (thus the full data +might not be held entirely in-memory), the quantized representation of the data will get +created in-memory, being concatenated from multiple calls to the data iterator. The quantized +version is typically lighter than the original data, so there might be cases in which this +representation could potentially fit in memory even if the full data doesn't. 
+ +For more information, see the guide 'Using XGBoost External Memory Version': +\url{https://xgboost.readthedocs.io/en/stable/tutorials/external_memory.html} +} +\seealso{ +\link{xgb.DataIter}, \link{xgb.ProxyDMatrix}, \link{xgb.ExternalDMatrix}, +\link{xgb.QuantileDMatrix} +} diff --git a/R-package/src/init.c b/R-package/src/init.c index fff5d9f901d2..a9f3f3e380c2 100644 --- a/R-package/src/init.c +++ b/R-package/src/init.c @@ -54,6 +54,14 @@ extern SEXP XGDMatrixCreateFromDF_R(SEXP, SEXP, SEXP); extern SEXP XGDMatrixGetStrFeatureInfo_R(SEXP, SEXP); extern SEXP XGDMatrixNumCol_R(SEXP); extern SEXP XGDMatrixNumRow_R(SEXP); +extern SEXP XGProxyDMatrixCreate_R(); +extern SEXP XGProxyDMatrixSetDataDense_R(SEXP, SEXP); +extern SEXP XGProxyDMatrixSetDataCSR_R(SEXP, SEXP); +extern SEXP XGProxyDMatrixSetDataColumnar_R(SEXP, SEXP); +extern SEXP XGDMatrixCreateFromCallback_R(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); +extern SEXP XGQuantileDMatrixCreateFromCallback_R(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); +extern SEXP XGDMatrixFree_R(SEXP); +extern SEXP XGGetRNAIntAsDouble(); extern SEXP XGDMatrixGetQuantileCut_R(SEXP); extern SEXP XGDMatrixNumNonMissing_R(SEXP); extern SEXP XGDMatrixGetDataAsCSR_R(SEXP); @@ -105,6 +113,14 @@ static const R_CallMethodDef CallEntries[] = { {"XGDMatrixGetStrFeatureInfo_R", (DL_FUNC) &XGDMatrixGetStrFeatureInfo_R, 2}, {"XGDMatrixNumCol_R", (DL_FUNC) &XGDMatrixNumCol_R, 1}, {"XGDMatrixNumRow_R", (DL_FUNC) &XGDMatrixNumRow_R, 1}, + {"XGProxyDMatrixCreate_R", (DL_FUNC) &XGProxyDMatrixCreate_R, 0}, + {"XGProxyDMatrixSetDataDense_R", (DL_FUNC) &XGProxyDMatrixSetDataDense_R, 2}, + {"XGProxyDMatrixSetDataCSR_R", (DL_FUNC) &XGProxyDMatrixSetDataCSR_R, 2}, + {"XGProxyDMatrixSetDataColumnar_R", (DL_FUNC) &XGProxyDMatrixSetDataColumnar_R, 2}, + {"XGDMatrixCreateFromCallback_R", (DL_FUNC) &XGDMatrixCreateFromCallback_R, 7}, + {"XGQuantileDMatrixCreateFromCallback_R", (DL_FUNC) &XGQuantileDMatrixCreateFromCallback_R, 8}, + {"XGDMatrixFree_R", 
(DL_FUNC) &XGDMatrixFree_R, 1}, + {"XGGetRNAIntAsDouble", (DL_FUNC) &XGGetRNAIntAsDouble, 0}, {"XGDMatrixGetQuantileCut_R", (DL_FUNC) &XGDMatrixGetQuantileCut_R, 1}, {"XGDMatrixNumNonMissing_R", (DL_FUNC) &XGDMatrixNumNonMissing_R, 1}, {"XGDMatrixGetDataAsCSR_R", (DL_FUNC) &XGDMatrixGetDataAsCSR_R, 1}, diff --git a/R-package/src/xgboost_R.cc b/R-package/src/xgboost_R.cc index 1d01b9aae967..c91fb94c447c 100644 --- a/R-package/src/xgboost_R.cc +++ b/R-package/src/xgboost_R.cc @@ -27,7 +27,12 @@ #include "./xgboost_R.h" // Must follow other includes. namespace { -struct ErrorWithUnwind : public std::exception {}; + +/* Note: this class is used as a throwable exception. +Some xgboost C functions that use callbacks will catch exceptions +that happen inside of the callback execution, hence it purposefully +doesn't inherit from 'std::exception' even if used as such. */ +struct ErrorWithUnwind {}; void ThrowExceptionFromRError(void *, Rboolean jump) { if (jump) { @@ -51,6 +56,27 @@ SEXP SafeMkChar(const char *c_str, SEXP continuation_token) { continuation_token); } +struct RFunAndEnv { + SEXP R_fun; + SEXP R_calling_env; +}; + +SEXP WrappedExecFun(void *void_ptr) { + RFunAndEnv *r_fun_and_env = static_cast(void_ptr); + SEXP f_expr = Rf_protect(Rf_lang1(r_fun_and_env->R_fun)); + SEXP out = Rf_protect(Rf_eval(f_expr, r_fun_and_env->R_calling_env)); + Rf_unprotect(2); + return out; +} + +SEXP SafeExecFun(SEXP R_fun, SEXP R_calling_env, SEXP continuation_token) { + RFunAndEnv r_fun_and_env{R_fun, R_calling_env}; + return R_UnwindProtect( + WrappedExecFun, static_cast(&r_fun_and_env), + ThrowExceptionFromRError, nullptr, + continuation_token); +} + SEXP WrappedAllocReal(void *void_ptr) { size_t *size = static_cast(void_ptr); return Rf_allocVector(REALSXP, *size); @@ -140,6 +166,47 @@ SEXP SafeAllocInteger(size_t size, SEXP continuation_token) { return ""; } +[[nodiscard]] std::string MakeArrayInterfaceFromRDataFrame(SEXP R_df) { + auto make_vec = [&](auto const *ptr, 
std::size_t len) { + auto v = xgboost::linalg::MakeVec(ptr, len); + return xgboost::linalg::ArrayInterface(v); + }; + + R_xlen_t n_features = Rf_xlength(R_df); + std::vector array(n_features); + CHECK_GT(n_features, 0); + std::size_t len = Rf_xlength(VECTOR_ELT(R_df, 0)); + + // The `data.frame` in R actually converts all data into numeric. The other type + // handlers here are not used. At the moment they are kept as a reference for when we + // can avoid making data copies during transformation. + for (R_xlen_t i = 0; i < n_features; ++i) { + switch (TYPEOF(VECTOR_ELT(R_df, i))) { + case INTSXP: { + auto const *ptr = INTEGER(VECTOR_ELT(R_df, i)); + array[i] = make_vec(ptr, len); + break; + } + case REALSXP: { + auto const *ptr = REAL(VECTOR_ELT(R_df, i)); + array[i] = make_vec(ptr, len); + break; + } + case LGLSXP: { + auto const *ptr = LOGICAL(VECTOR_ELT(R_df, i)); + array[i] = make_vec(ptr, len); + break; + } + default: { + LOG(FATAL) << "data.frame has unsupported type."; + } + } + } + + xgboost::Json jinterface{std::move(array)}; + return xgboost::Json::Dump(jinterface); +} + [[nodiscard]] std::string MakeJsonConfigForArray(SEXP missing, SEXP n_threads, SEXPTYPE arr_type) { using namespace ::xgboost; // NOLINT Json jconfig{Object{}}; @@ -335,51 +402,13 @@ XGB_DLL SEXP XGDMatrixCreateFromDF_R(SEXP df, SEXP missing, SEXP n_threads) { R_API_BEGIN(); DMatrixHandle handle; - - auto make_vec = [&](auto const *ptr, std::int32_t len) { - auto v = xgboost::linalg::MakeVec(ptr, len); - return xgboost::linalg::ArrayInterface(v); - }; - std::int32_t rc{0}; { - using xgboost::Json; - auto n_features = Rf_xlength(df); - std::vector array(n_features); - CHECK_GT(n_features, 0); - auto len = Rf_xlength(VECTOR_ELT(df, 0)); - // The `data.frame` in R actually converts all data into numeric. The other type - // handlers here are not used. At the moment they are kept as a reference for when we - // can avoid making data copies during transformation. 
- for (decltype(n_features) i = 0; i < n_features; ++i) { - switch (TYPEOF(VECTOR_ELT(df, i))) { - case INTSXP: { - auto const *ptr = INTEGER(VECTOR_ELT(df, i)); - array[i] = make_vec(ptr, len); - break; - } - case REALSXP: { - auto const *ptr = REAL(VECTOR_ELT(df, i)); - array[i] = make_vec(ptr, len); - break; - } - case LGLSXP: { - auto const *ptr = LOGICAL(VECTOR_ELT(df, i)); - array[i] = make_vec(ptr, len); - break; - } - default: { - LOG(FATAL) << "data.frame has unsupported type."; - } - } - } - - Json jinterface{std::move(array)}; - auto sinterface = Json::Dump(jinterface); - Json jconfig{xgboost::Object{}}; + std::string sinterface = MakeArrayInterfaceFromRDataFrame(df); + xgboost::Json jconfig{xgboost::Object{}}; jconfig["missing"] = asReal(missing); jconfig["nthread"] = asInteger(n_threads); - auto sconfig = Json::Dump(jconfig); + std::string sconfig = xgboost::Json::Dump(jconfig); rc = XGDMatrixCreateFromColumnar(sinterface.c_str(), sconfig.c_str(), &handle); } @@ -632,6 +661,192 @@ XGB_DLL SEXP XGDMatrixNumCol_R(SEXP handle) { return ScalarInteger(static_cast(ncol)); } +XGB_DLL SEXP XGProxyDMatrixCreate_R() { + SEXP out = Rf_protect(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue)); + R_API_BEGIN(); + DMatrixHandle proxy_dmat_handle; + CHECK_CALL(XGProxyDMatrixCreate(&proxy_dmat_handle)); + R_SetExternalPtrAddr(out, proxy_dmat_handle); + R_RegisterCFinalizerEx(out, _DMatrixFinalizer, TRUE); + Rf_unprotect(1); + R_API_END(); + return out; +} + +XGB_DLL SEXP XGProxyDMatrixSetDataDense_R(SEXP handle, SEXP R_mat) { + R_API_BEGIN(); + DMatrixHandle proxy_dmat = R_ExternalPtrAddr(handle); + int res_code; + { + std::string array_str = MakeArrayInterfaceFromRMat(R_mat); + res_code = XGProxyDMatrixSetDataDense(proxy_dmat, array_str.c_str()); + } + CHECK_CALL(res_code); + R_API_END(); + return R_NilValue; +} + +XGB_DLL SEXP XGProxyDMatrixSetDataCSR_R(SEXP handle, SEXP lst) { + R_API_BEGIN(); + DMatrixHandle proxy_dmat = R_ExternalPtrAddr(handle); + int 
res_code; + { + std::string array_str_indptr = MakeArrayInterfaceFromRVector(VECTOR_ELT(lst, 0)); + std::string array_str_indices = MakeArrayInterfaceFromRVector(VECTOR_ELT(lst, 1)); + std::string array_str_data = MakeArrayInterfaceFromRVector(VECTOR_ELT(lst, 2)); + const int ncol = Rf_asInteger(VECTOR_ELT(lst, 3)); + res_code = XGProxyDMatrixSetDataCSR(proxy_dmat, + array_str_indptr.c_str(), + array_str_indices.c_str(), + array_str_data.c_str(), + ncol); + } + CHECK_CALL(res_code); + R_API_END(); + return R_NilValue; +} + +XGB_DLL SEXP XGProxyDMatrixSetDataColumnar_R(SEXP handle, SEXP lst) { + R_API_BEGIN(); + DMatrixHandle proxy_dmat = R_ExternalPtrAddr(handle); + int res_code; + { + std::string sinterface = MakeArrayInterfaceFromRDataFrame(lst); + res_code = XGProxyDMatrixSetDataColumnar(proxy_dmat, sinterface.c_str()); + } + CHECK_CALL(res_code); + R_API_END(); + return R_NilValue; +} + +namespace { + +struct _RDataIterator { + SEXP f_next; + SEXP f_reset; + SEXP calling_env; + SEXP continuation_token; + + _RDataIterator( + SEXP f_next, SEXP f_reset, SEXP calling_env, SEXP continuation_token) : + f_next(f_next), f_reset(f_reset), calling_env(calling_env), + continuation_token(continuation_token) {} + + void reset() { + SafeExecFun(this->f_reset, this->calling_env, this->continuation_token); + } + + int next() { + SEXP R_res = Rf_protect( + SafeExecFun(this->f_next, this->calling_env, this->continuation_token)); + int res = Rf_asInteger(R_res); + Rf_unprotect(1); + return res; + } +}; + +void _reset_RDataIterator(DataIterHandle iter) { + static_cast<_RDataIterator*>(iter)->reset(); +} + +int _next_RDataIterator(DataIterHandle iter) { + return static_cast<_RDataIterator*>(iter)->next(); +} + +SEXP XGDMatrixCreateFromCallbackGeneric_R( + SEXP f_next, SEXP f_reset, SEXP calling_env, SEXP proxy_dmat, + SEXP n_threads, SEXP missing, SEXP max_bin, SEXP ref_dmat, + SEXP cache_prefix, bool as_quantile_dmatrix) { + SEXP continuation_token = 
Rf_protect(R_MakeUnwindCont()); + SEXP out = Rf_protect(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue)); + R_API_BEGIN(); + DMatrixHandle out_dmat; + + int res_code; + try { + _RDataIterator data_iterator(f_next, f_reset, calling_env, continuation_token); + + std::string str_cache_prefix; + xgboost::Json jconfig{xgboost::Object{}}; + jconfig["missing"] = Rf_asReal(missing); + if (!Rf_isNull(n_threads)) { + jconfig["nthread"] = Rf_asInteger(n_threads); + } + if (as_quantile_dmatrix) { + if (!Rf_isNull(max_bin)) { + jconfig["max_bin"] = Rf_asInteger(max_bin); + } + } else { + str_cache_prefix = std::string(CHAR(Rf_asChar(cache_prefix))); + jconfig["cache_prefix"] = str_cache_prefix; + } + std::string json_str = xgboost::Json::Dump(jconfig); + + DMatrixHandle ref_dmat_handle = nullptr; + if (as_quantile_dmatrix && !Rf_isNull(ref_dmat)) { + ref_dmat_handle = R_ExternalPtrAddr(ref_dmat); + } + + if (as_quantile_dmatrix) { + res_code = XGQuantileDMatrixCreateFromCallback( + &data_iterator, + R_ExternalPtrAddr(proxy_dmat), + ref_dmat_handle, + _reset_RDataIterator, + _next_RDataIterator, + json_str.c_str(), + &out_dmat); + } else { + res_code = XGDMatrixCreateFromCallback( + &data_iterator, + R_ExternalPtrAddr(proxy_dmat), + _reset_RDataIterator, + _next_RDataIterator, + json_str.c_str(), + &out_dmat); + } + } catch (ErrorWithUnwind &e) { + R_ContinueUnwind(continuation_token); + } + CHECK_CALL(res_code); + + R_SetExternalPtrAddr(out, out_dmat); + R_RegisterCFinalizerEx(out, _DMatrixFinalizer, TRUE); + Rf_unprotect(2); + R_API_END(); + return out; +} + +} /* namespace */ + +XGB_DLL SEXP XGQuantileDMatrixCreateFromCallback_R( + SEXP f_next, SEXP f_reset, SEXP calling_env, SEXP proxy_dmat, + SEXP n_threads, SEXP missing, SEXP max_bin, SEXP ref_dmat) { + return XGDMatrixCreateFromCallbackGeneric_R( + f_next, f_reset, calling_env, proxy_dmat, + n_threads, missing, max_bin, ref_dmat, + R_NilValue, true); +} + +XGB_DLL SEXP XGDMatrixCreateFromCallback_R( + SEXP f_next, SEXP 
f_reset, SEXP calling_env, SEXP proxy_dmat, + SEXP n_threads, SEXP missing, SEXP cache_prefix) { + return XGDMatrixCreateFromCallbackGeneric_R( + f_next, f_reset, calling_env, proxy_dmat, + n_threads, missing, R_NilValue, R_NilValue, + cache_prefix, false); +} + +XGB_DLL SEXP XGDMatrixFree_R(SEXP proxy_dmat) { + _DMatrixFinalizer(proxy_dmat); + return R_NilValue; +} + +XGB_DLL SEXP XGGetRNAIntAsDouble() { + double sentinel_as_double = static_cast<double>(R_NaInt); + return Rf_ScalarReal(sentinel_as_double); +} + XGB_DLL SEXP XGDuplicate_R(SEXP obj) { return Rf_duplicate(obj); } diff --git a/R-package/src/xgboost_R.h b/R-package/src/xgboost_R.h index ec30dbada79f..d2e0ae82855d 100644 --- a/R-package/src/xgboost_R.h +++ b/R-package/src/xgboost_R.h @@ -161,6 +161,84 @@ XGB_DLL SEXP XGDMatrixNumRow_R(SEXP handle); */ XGB_DLL SEXP XGDMatrixNumCol_R(SEXP handle); +/*! +<<<<<<< HEAD + * \brief create a ProxyDMatrix and get an R externalptr object for it + */ +XGB_DLL SEXP XGProxyDMatrixCreate_R(); + +/*! + * \brief Set dense matrix data on a proxy dmatrix + * \param handle R externalptr pointing to a ProxyDMatrix + * \param R_mat R matrix to set in the proxy dmatrix + */ +XGB_DLL SEXP XGProxyDMatrixSetDataDense_R(SEXP handle, SEXP R_mat); + +/*! + * \brief Set CSR matrix data on a proxy dmatrix + * \param handle R externalptr pointing to a ProxyDMatrix + * \param lst R list containing, in this order: + * 1. 'p' or 'indptr' vector of the CSR matrix. + * 2. 'j' or 'indices' vector of the CSR matrix. + * 3. 'x' or 'data' vector of the CSR matrix. + * 4. Number of columns in the CSR matrix. + */ +XGB_DLL SEXP XGProxyDMatrixSetDataCSR_R(SEXP handle, SEXP lst); + +/*! + * \brief Set columnar data on a proxy dmatrix + * \param handle R externalptr pointing to a ProxyDMatrix + * \param lst R list or data.frame object containing its columns as numeric vectors + */ +XGB_DLL SEXP XGProxyDMatrixSetDataColumnar_R(SEXP handle, SEXP lst); + +/*!
+ * \brief Create a DMatrix from a DataIter with callbacks + * \param expr_f_next expression for function(env, proxy_dmat) that sets the data on the proxy + * dmatrix and returns either zero (end of batch) or one (batch continues). + * \param expr_f_reset expression for function(env) that resets the data iterator to + * the beginning (first batch). + * \param calling_env R environment where to evaluate the expressions above + * \param proxy_dmat R externalptr holding a ProxyDMatrix. + * \param n_threads number of parallel threads to use for constructing the DMatrix. + * \param missing which value to represent missing value. + * \param cache_prefix path of cache file + * \return handle R externalptr holding the resulting DMatrix. + */ +XGB_DLL SEXP XGDMatrixCreateFromCallback_R( + SEXP expr_f_next, SEXP expr_f_reset, SEXP calling_env, SEXP proxy_dmat, + SEXP n_threads, SEXP missing, SEXP cache_prefix); + +/*! + * \brief Create a QuantileDMatrix from a DataIter with callbacks + * \param expr_f_next expression for function(env, proxy_dmat) that sets the data on the proxy + * dmatrix and returns either zero (end of batch) or one (batch continues). + * \param expr_f_reset expression for function(env) that resets the data iterator to + * the beginning (first batch). + * \param calling_env R environment where to evaluate the expressions above + * \param proxy_dmat R externalptr holding a ProxyDMatrix. + * \param n_threads number of parallel threads to use for constructing the QuantileDMatrix. + * \param missing which value to represent missing value. + * \param max_bin maximum number of bins to have in the resulting QuantileDMatrix. + * \param ref_dmat an optional reference DMatrix from which to get the bin boundaries. + * \return handle R externalptr holding the resulting QuantileDMatrix. 
+ */ +XGB_DLL SEXP XGQuantileDMatrixCreateFromCallback_R( + SEXP expr_f_next, SEXP expr_f_reset, SEXP calling_env, SEXP proxy_dmat, + SEXP n_threads, SEXP missing, SEXP max_bin, SEXP ref_dmat); + +/*! + * \brief Frees a ProxyDMatrix and empties out the R externalptr object that holds it + * \param proxy_dmat R externalptr containing a ProxyDMatrix + * \return NULL + */ +XGB_DLL SEXP XGDMatrixFree_R(SEXP proxy_dmat); + +/*! + * \brief Get the value that represents missingness in R integers as a numeric non-missing value. + */ +XGB_DLL SEXP XGGetRNAIntAsDouble(); + /*! * \brief Call R C-level function 'duplicate' * \param obj Object to duplicate diff --git a/R-package/tests/testthat/test_dmatrix.R b/R-package/tests/testthat/test_dmatrix.R index 568aaa3bd78d..65374240df00 100644 --- a/R-package/tests/testthat/test_dmatrix.R +++ b/R-package/tests/testthat/test_dmatrix.R @@ -343,7 +343,7 @@ test_that("xgb.DMatrix: data.frame", { expect_equal( getinfo(m, "feature_type"), c("float", "float", "int", "i", "c", "c") ) - expect_error(xgb.DMatrix(df)) + expect_error(xgb.DMatrix(df, enable_categorical = FALSE)) df <- data.frame( missing = c("a", "b", "d", NA), @@ -380,6 +380,261 @@ test_that("xgb.DMatrix: can take multi-dimensional 'base_margin'", { expect_equal(pred_only_x, pred_w_base - b, tolerance = 1e-5) }) +test_that("xgb.DMatrix: QuantileDMatrix produces same result as DMatrix", { + data(mtcars) + y <- mtcars[, 1] + x <- mtcars[, -1] + + cast_matrix <- function(x) as.matrix(x) + cast_df <- function(x) as.data.frame(x) + cast_csr <- function(x) as(as.matrix(x), "RsparseMatrix") + casting_funs <- list(cast_matrix, cast_df, cast_csr) + + for (casting_fun in casting_funs) { + + qdm <- xgb.QuantileDMatrix( + data = casting_fun(x), + label = y, + nthread = n_threads, + max_bin = 5 + ) + params <- list( + tree_method = "hist", + objective = "reg:squarederror", + nthread = n_threads, + max_bin = 5 + ) + model_qdm <- xgb.train( + params = params, + data = qdm, + nrounds = 2 + ) + 
pred_qdm <- predict(model_qdm, x) + + dm <- xgb.DMatrix( + data = x, + label = y, + nthread = n_threads + ) + model_dm <- xgb.train( + params = params, + data = dm, + nrounds = 2 + ) + pred_dm <- predict(model_dm, x) + + expect_equal(pred_qdm, pred_dm) + } +}) + +test_that("xgb.DMatrix: QuantileDMatrix is not accepted by exact method", { + data(mtcars) + y <- mtcars[, 1] + x <- as.matrix(mtcars[, -1]) + qdm <- xgb.QuantileDMatrix( + data = x, + label = y, + nthread = n_threads + ) + params <- list( + tree_method = "exact", + objective = "reg:squarederror", + nthread = n_threads + ) + expect_error({ + xgb.train( + params = params, + data = qdm, + nrounds = 2 + ) + }) +}) + +test_that("xgb.DMatrix: ExternalDMatrix produces the same results as regular DMatrix", { + data(mtcars) + y <- mtcars[, 1] + x <- as.matrix(mtcars[, -1]) + set.seed(123) + params <- list( + objective = "reg:squarederror", + nthread = n_threads + ) + model <- xgb.train( + data = xgb.DMatrix(x, label = y), + params = params, + nrounds = 5 + ) + pred <- predict(model, x) + + iterator_env <- as.environment( + list( + iter = 0, + x = mtcars[, -1], + y = mtcars[, 1] + ) + ) + iterator_next <- function(iterator_env, proxy_handle) { + curr_iter <- iterator_env[["iter"]] + if (curr_iter >= 2) { + return(NULL) + } + if (curr_iter == 0) { + x_batch <- iterator_env[["x"]][1:16, ] + y_batch <- iterator_env[["y"]][1:16] + } else { + x_batch <- iterator_env[["x"]][17:32, ] + y_batch <- iterator_env[["y"]][17:32] + } + on.exit({ + iterator_env[["iter"]] <- curr_iter + 1 + }) + return(xgb.ProxyDMatrix(data = x_batch, label = y_batch)) + } + iterator_reset <- function(iterator_env) { + iterator_env[["iter"]] <- 0 + } + data_iterator <- xgb.DataIter( + env = iterator_env, + f_next = iterator_next, + f_reset = iterator_reset + ) + cache_prefix <- tempdir() + edm <- xgb.ExternalDMatrix(data_iterator, cache_prefix, nthread = 1) + expect_true(inherits(edm, "xgb.ExternalDMatrix")) + expect_true(inherits(edm, 
"xgb.DMatrix")) + set.seed(123) + model_ext <- xgb.train( + data = edm, + params = params, + nrounds = 5 + ) + + pred_model1_edm <- predict(model, edm) + pred_model2_mat <- predict(model_ext, x) + pred_model2_edm <- predict(model_ext, edm) + + expect_equal(pred_model1_edm, pred) + expect_equal(pred_model2_mat, pred) + expect_equal(pred_model2_edm, pred) +}) + +test_that("xgb.DMatrix: External QDM produces same results as regular QDM", { + data(mtcars) + y <- mtcars[, 1] + x <- as.matrix(mtcars[, -1]) + set.seed(123) + params <- list( + objective = "reg:squarederror", + nthread = n_threads, + max_bin = 3 + ) + model <- xgb.train( + data = xgb.QuantileDMatrix( + x, + label = y, + nthread = 1, + max_bin = 3 + ), + params = params, + nrounds = 5 + ) + pred <- predict(model, x) + + iterator_env <- as.environment( + list( + iter = 0, + x = mtcars[, -1], + y = mtcars[, 1] + ) + ) + iterator_next <- function(iterator_env, proxy_handle) { + curr_iter <- iterator_env[["iter"]] + if (curr_iter >= 2) { + return(NULL) + } + if (curr_iter == 0) { + x_batch <- iterator_env[["x"]][1:16, ] + y_batch <- iterator_env[["y"]][1:16] + } else { + x_batch <- iterator_env[["x"]][17:32, ] + y_batch <- iterator_env[["y"]][17:32] + } + on.exit({ + iterator_env[["iter"]] <- curr_iter + 1 + }) + return(xgb.ProxyDMatrix(data = x_batch, label = y_batch)) + } + iterator_reset <- function(iterator_env) { + iterator_env[["iter"]] <- 0 + } + data_iterator <- xgb.DataIter( + env = iterator_env, + f_next = iterator_next, + f_reset = iterator_reset + ) + cache_prefix <- tempdir() + qdm <- xgb.QuantileDMatrix.from_iterator( + data_iterator, + max_bin = 3, + nthread = 1 + ) + expect_true(inherits(qdm, "xgb.QuantileDMatrix")) + expect_true(inherits(qdm, "xgb.DMatrix")) + set.seed(123) + model_ext <- xgb.train( + data = qdm, + params = params, + nrounds = 5 + ) + + pred_model1_qdm <- predict(model, qdm) + pred_model2_mat <- predict(model_ext, x) + pred_model2_qdm <- predict(model_ext, qdm) + + 
expect_equal(pred_model1_qdm, pred) + expect_equal(pred_model2_mat, pred) + expect_equal(pred_model2_qdm, pred) +}) + +test_that("xgb.DMatrix: R errors thrown on DataIterator are thrown back to the user", { + data(mtcars) + iterator_env <- as.environment( + list( + iter = 0, + x = mtcars[, -1], + y = mtcars[, 1] + ) + ) + iterator_next <- function(iterator_env, proxy_handle) { + curr_iter <- iterator_env[["iter"]] + if (curr_iter >= 2) { + return(0) + } + if (curr_iter == 0) { + x_batch <- iterator_env[["x"]][1:16, ] + y_batch <- iterator_env[["y"]][1:16] + } else { + stop("custom error") + } + on.exit({ + iterator_env[["iter"]] <- curr_iter + 1 + }) + return(xgb.ProxyDMatrix(data = x_batch, label = y_batch)) + } + iterator_reset <- function(iterator_env) { + iterator_env[["iter"]] <- 0 + } + data_iterator <- xgb.DataIter( + env = iterator_env, + f_next = iterator_next, + f_reset = iterator_reset + ) + expect_error( + {xgb.ExternalDMatrix(data_iterator, nthread = 1)}, + "custom error" + ) +}) + test_that("xgb.DMatrix: number of non-missing matches data", { x <- matrix(1:10, nrow = 5) dm1 <- xgb.DMatrix(x) diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index 5761b4b14db7..27331d3de5ca 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -798,9 +798,23 @@ def __init__( Set names for features. feature_types : - Set types for features. When `enable_categorical` is set to `True`, string - "c" represents categorical data type while "q" represents numerical feature - type. For categorical features, the input is assumed to be preprocessed and + Set types for features. If `data` is a DataFrame type and passing + `enable_categorical=True`, the types will be deduced automatically + from the column types. + + Otherwise, one can pass a list-like input with the same length as number + of columns in `data`, with the following possible values: + - "c", which represents categorical columns. 
+ - "q", which represents numeric columns. + - "int", which represents integer columns. + - "i", which represents boolean columns. + + Note that, while categorical types are treated differently from + the rest for model fitting purposes, the other types do not influence + the generated model, but have effects in other functionalities such as + feature importances. + + For categorical features, the input is assumed to be preprocessed and encoded by the users. The encoding can be done via :py:class:`sklearn.preprocessing.OrdinalEncoder` or pandas dataframe `.cat.codes` method. This is useful when users want to specify categorical