From f7005d32c182e1c8751b2549cde9d16f327486a1 Mon Sep 17 00:00:00 2001 From: david-cortes Date: Fri, 23 Feb 2024 19:03:54 +0100 Subject: [PATCH] [R] Use inplace predict (#9829) --------- Co-authored-by: Hyunsu Cho --- R-package/R/xgb.Booster.R | 137 +++++++++++++++--- R-package/man/predict.xgb.Booster.Rd | 45 ++++-- R-package/src/init.c | 6 + R-package/src/xgboost_R.cc | 184 +++++++++++++++++++++--- R-package/src/xgboost_R.h | 44 ++++++ R-package/tests/testthat/test_basic.R | 49 ++++++- R-package/tests/testthat/test_dmatrix.R | 31 ++++ 7 files changed, 450 insertions(+), 46 deletions(-) diff --git a/R-package/R/xgb.Booster.R b/R-package/R/xgb.Booster.R index febefb757129..8a5d66198834 100644 --- a/R-package/R/xgb.Booster.R +++ b/R-package/R/xgb.Booster.R @@ -77,26 +77,45 @@ xgb.get.handle <- function(object) { #' Predict method for XGBoost model #' -#' Predicted values based on either xgboost model or model handle object. +#' Predict values on data based on xgboost model. #' #' @param object Object of class `xgb.Booster`. -#' @param newdata Takes `matrix`, `dgCMatrix`, `dgRMatrix`, `dsparseVector`, +#' @param newdata Takes `data.frame`, `matrix`, `dgCMatrix`, `dgRMatrix`, `dsparseVector`, #' local data file, or `xgb.DMatrix`. -#' For single-row predictions on sparse data, it is recommended to use the CSR format. -#' If passing a sparse vector, it will take it as a row vector. -#' @param missing Only used when input is a dense matrix. Pick a float value that represents -#' missing values in data (e.g., 0 or some other extreme value). +#' +#' For single-row predictions on sparse data, it's recommended to use CSR format. If passing +#' a sparse vector, it will take it as a row vector. +#' +#' Note that, for repeated predictions on the same data, one might want to create a DMatrix to +#' pass here instead of passing R types like matrices or data frames, as predictions will be +#' faster on DMatrix. +#' +#' If `newdata` is a `data.frame`, be aware that:\itemize{ +#' \item Columns will be converted to numeric if they aren't already, which could potentially make +#' the operation slower than in an equivalent `matrix` object. +#' \item The order of the columns must match with that of the data from which the model was fitted +#' (i.e. columns will not be referenced by their names, just by their order in the data). +#' \item If the model was fitted to data with categorical columns, these columns must be of +#' `factor` type here, and must use the same encoding (i.e. have the same levels). +#' \item If `newdata` contains any `factor` columns, they will be converted to base-0 +#' encoding (same as during DMatrix creation) - hence, one should not pass a `factor` +#' under a column which during training had a different type. +#' } +#' @param missing Float value that represents missing values in data (e.g., 0 or some other extreme value). +#' +#' This parameter is not used when `newdata` is an `xgb.DMatrix` - in such cases, should pass +#' this as an argument to the DMatrix constructor instead. #' @param outputmargin Whether the prediction should be returned in the form of original untransformed #' sum of predictions from boosting iterations' results. E.g., setting `outputmargin=TRUE` for #' logistic regression would return log-odds instead of probabilities. -#' @param predleaf Whether to predict pre-tree leaf indices. +#' @param predleaf Whether to predict per-tree leaf indices. #' @param predcontrib Whether to return feature contributions to individual predictions (see Details). #' @param approxcontrib Whether to use a fast approximation for feature contributions (see Details). #' @param predinteraction Whether to return contributions of feature interactions to individual predictions (see Details). #' @param reshape Whether to reshape the vector of predictions to matrix form when there are several #' prediction outputs per case. No effect if `predleaf`, `predcontrib`, #' or `predinteraction` is `TRUE`. -#' @param training Whether the predictions are used for training. For dart booster, +#' @param training Whether the prediction result is used for training. For dart booster, #' training predicting will perform dropout. #' @param iterationrange Sequence of rounds/iterations from the model to use for prediction, specified by passing #' a two-dimensional vector with the start and end numbers in the sequence (same format as R's `seq` - i.e. @@ -111,6 +130,12 @@ xgb.get.handle <- function(object) { #' If passing "all", will use all of the rounds regardless of whether the model had early stopping or not. #' @param strict_shape Default is `FALSE`. When set to `TRUE`, the output #' type and shape of predictions are invariant to the model type. +#' @param base_margin Base margin used for boosting from existing model. +#' +#' Note that, if `newdata` is an `xgb.DMatrix` object, this argument will +#' be ignored as it needs to be added to the DMatrix instead (e.g. by passing it as +#' an argument in its constructor, or by calling \link{setinfo.xgb.DMatrix}). +#' #' @param validate_features When `TRUE`, validate that the Booster's and newdata's feature_names #' match (only applicable when both `object` and `newdata` have feature names). #' @@ -287,16 +312,80 @@ xgb.get.handle <- function(object) { predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FALSE, predleaf = FALSE, predcontrib = FALSE, approxcontrib = FALSE, predinteraction = FALSE, reshape = FALSE, training = FALSE, iterationrange = NULL, strict_shape = FALSE, - validate_features = FALSE, ...) { + validate_features = FALSE, base_margin = NULL, ...) { if (validate_features) { newdata <- validate.features(object, newdata) } - if (!inherits(newdata, "xgb.DMatrix")) { + is_dmatrix <- inherits(newdata, "xgb.DMatrix") + if (is_dmatrix && !is.null(base_margin)) { + stop( + "'base_margin' is not supported when passing 'xgb.DMatrix' as input.", + " Should be passed as argument to 'xgb.DMatrix' constructor." + ) + } + + use_as_df <- FALSE + use_as_dense_matrix <- FALSE + use_as_csr_matrix <- FALSE + n_row <- NULL + if (!is_dmatrix) { + + inplace_predict_supported <- !predcontrib && !predinteraction && !predleaf + if (inplace_predict_supported) { + booster_type <- xgb.booster_type(object) + if (booster_type == "gblinear" || (booster_type == "dart" && training)) { + inplace_predict_supported <- FALSE + } + } + if (inplace_predict_supported) { + + if (is.matrix(newdata)) { + use_as_dense_matrix <- TRUE + } else if (is.data.frame(newdata)) { + # note: since here it turns it into a non-data-frame list, + # needs to keep track of the number of rows it had for later + n_row <- nrow(newdata) + newdata <- lapply( + newdata, + function(x) { + if (is.factor(x)) { + return(as.numeric(x) - 1) + } else { + return(as.numeric(x)) + } + } + ) + use_as_df <- TRUE + } else if (inherits(newdata, "dgRMatrix")) { + use_as_csr_matrix <- TRUE + csr_data <- list(newdata@p, newdata@j, newdata@x, ncol(newdata)) + } else if (inherits(newdata, "dsparseVector")) { + use_as_csr_matrix <- TRUE + n_row <- 1L + i <- newdata@i - 1L + if (storage.mode(i) != "integer") { + storage.mode(i) <- "integer" + } + csr_data <- list(c(0L, length(i)), i, newdata@x, length(newdata)) + } + + } + + } # if (!is_dmatrix) + + if (!is_dmatrix && !use_as_dense_matrix && !use_as_csr_matrix && !use_as_df) { nthread <- xgb.nthread(object) newdata <- xgb.DMatrix( newdata, - missing = missing, nthread = NVL(nthread, -1) + missing = missing, + base_margin = base_margin, + nthread = NVL(nthread, -1) ) + is_dmatrix <- TRUE + } + + if (is.null(n_row)) { + n_row <- nrow(newdata) } @@ -354,18 +443,30 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA args$type <- set_type(6) } - predts <- .Call( - XGBoosterPredictFromDMatrix_R, - xgb.get.handle(object), - newdata, - jsonlite::toJSON(args, auto_unbox = TRUE) - ) + json_conf <- jsonlite::toJSON(args, auto_unbox = TRUE) + if (is_dmatrix) { + predts <- .Call( + XGBoosterPredictFromDMatrix_R, xgb.get.handle(object), newdata, json_conf + ) + } else if (use_as_dense_matrix) { + predts <- .Call( + XGBoosterPredictFromDense_R, xgb.get.handle(object), newdata, missing, json_conf, base_margin + ) + } else if (use_as_csr_matrix) { + predts <- .Call( + XGBoosterPredictFromCSR_R, xgb.get.handle(object), csr_data, missing, json_conf, base_margin + ) + } else if (use_as_df) { + predts <- .Call( + XGBoosterPredictFromColumnar_R, xgb.get.handle(object), newdata, missing, json_conf, base_margin + ) + } + names(predts) <- c("shape", "results") shape <- predts$shape arr <- predts$results n_ret <- length(arr) - n_row <- nrow(newdata) if (n_row != shape[1]) { stop("Incorrect predict shape.") } diff --git a/R-package/man/predict.xgb.Booster.Rd b/R-package/man/predict.xgb.Booster.Rd index 95e7a51fd043..88a2f203efcd 100644 --- a/R-package/man/predict.xgb.Booster.Rd +++ b/R-package/man/predict.xgb.Booster.Rd @@ -18,25 +18,47 @@ iterationrange = NULL, strict_shape = FALSE, validate_features = FALSE, + base_margin = NULL, ... ) } \arguments{ \item{object}{Object of class \code{xgb.Booster}.} -\item{newdata}{Takes \code{matrix}, \code{dgCMatrix}, \code{dgRMatrix}, \code{dsparseVector}, +\item{newdata}{Takes \code{data.frame}, \code{matrix}, \code{dgCMatrix}, \code{dgRMatrix}, \code{dsparseVector}, local data file, or \code{xgb.DMatrix}. -For single-row predictions on sparse data, it is recommended to use the CSR format. -If passing a sparse vector, it will take it as a row vector.} -\item{missing}{Only used when input is a dense matrix. Pick a float value that represents -missing values in data (e.g., 0 or some other extreme value).} +\if{html}{\out{
}}\preformatted{ For single-row predictions on sparse data, it's recommended to use CSR format. If passing + a sparse vector, it will take it as a row vector. + + Note that, for repeated predictions on the same data, one might want to create a DMatrix to + pass here instead of passing R types like matrices or data frames, as predictions will be + faster on DMatrix. + + If `newdata` is a `data.frame`, be aware that:\\itemize\{ + \\item Columns will be converted to numeric if they aren't already, which could potentially make + the operation slower than in an equivalent `matrix` object. + \\item The order of the columns must match with that of the data from which the model was fitted + (i.e. columns will not be referenced by their names, just by their order in the data). + \\item If the model was fitted to data with categorical columns, these columns must be of + `factor` type here, and must use the same encoding (i.e. have the same levels). + \\item If `newdata` contains any `factor` columns, they will be converted to base-0 + encoding (same as during DMatrix creation) - hence, one should not pass a `factor` + under a column which during training had a different type. + \} +}\if{html}{\out{
}}} + +\item{missing}{Float value that represents missing values in data (e.g., 0 or some other extreme value). + +\if{html}{\out{
}}\preformatted{ This parameter is not used when `newdata` is an `xgb.DMatrix` - in such cases, should pass + this as an argument to the DMatrix constructor instead. +}\if{html}{\out{
}}} \item{outputmargin}{Whether the prediction should be returned in the form of original untransformed sum of predictions from boosting iterations' results. E.g., setting \code{outputmargin=TRUE} for logistic regression would return log-odds instead of probabilities.} -\item{predleaf}{Whether to predict pre-tree leaf indices.} +\item{predleaf}{Whether to predict per-tree leaf indices.} \item{predcontrib}{Whether to return feature contributions to individual predictions (see Details).} @@ -48,7 +70,7 @@ logistic regression would return log-odds instead of probabilities.} prediction outputs per case. No effect if \code{predleaf}, \code{predcontrib}, or \code{predinteraction} is \code{TRUE}.} -\item{training}{Whether the predictions are used for training. For dart booster, +\item{training}{Whether the prediction result is used for training. For dart booster, training predicting will perform dropout.} \item{iterationrange}{Sequence of rounds/iterations from the model to use for prediction, specified by passing @@ -84,6 +106,13 @@ match (only applicable when both \code{object} and \code{newdata} have feature n recommended to disable it for performance-sensitive applications. }\if{html}{\out{}}} +\item{base_margin}{Base margin used for boosting from existing model. + +\if{html}{\out{
}}\preformatted{ Note that, if `newdata` is an `xgb.DMatrix` object, this argument will + be ignored as it needs to be added to the DMatrix instead (e.g. by passing it as + an argument in its constructor, or by calling \link{setinfo.xgb.DMatrix}). +}\if{html}{\out{
}}} + \item{...}{Not used.} } \value{ @@ -115,7 +144,7 @@ When \code{strict_shape = TRUE}, the output is always an array: } } \description{ -Predicted values based on either xgboost model or model handle object. +Predict values on data based on xgboost model. } \details{ Note that \code{iterationrange} would currently do nothing for predictions from "gblinear", diff --git a/R-package/src/init.c b/R-package/src/init.c index 36f3e8953639..f2635742ebd7 100644 --- a/R-package/src/init.c +++ b/R-package/src/init.c @@ -37,6 +37,9 @@ extern SEXP XGBoosterLoadJsonConfig_R(SEXP handle, SEXP value); extern SEXP XGBoosterSerializeToBuffer_R(SEXP handle); extern SEXP XGBoosterUnserializeFromBuffer_R(SEXP handle, SEXP raw); extern SEXP XGBoosterPredictFromDMatrix_R(SEXP, SEXP, SEXP); +extern SEXP XGBoosterPredictFromDense_R(SEXP, SEXP, SEXP, SEXP, SEXP); +extern SEXP XGBoosterPredictFromCSR_R(SEXP, SEXP, SEXP, SEXP, SEXP); +extern SEXP XGBoosterPredictFromColumnar_R(SEXP, SEXP, SEXP, SEXP, SEXP); extern SEXP XGBoosterSaveModel_R(SEXP, SEXP); extern SEXP XGBoosterSetAttr_R(SEXP, SEXP, SEXP); extern SEXP XGBoosterSetParam_R(SEXP, SEXP, SEXP); @@ -96,6 +99,9 @@ static const R_CallMethodDef CallEntries[] = { {"XGBoosterSerializeToBuffer_R", (DL_FUNC) &XGBoosterSerializeToBuffer_R, 1}, {"XGBoosterUnserializeFromBuffer_R", (DL_FUNC) &XGBoosterUnserializeFromBuffer_R, 2}, {"XGBoosterPredictFromDMatrix_R", (DL_FUNC) &XGBoosterPredictFromDMatrix_R, 3}, + {"XGBoosterPredictFromDense_R", (DL_FUNC) &XGBoosterPredictFromDense_R, 5}, + {"XGBoosterPredictFromCSR_R", (DL_FUNC) &XGBoosterPredictFromCSR_R, 5}, + {"XGBoosterPredictFromColumnar_R", (DL_FUNC) &XGBoosterPredictFromColumnar_R, 5}, {"XGBoosterSaveModel_R", (DL_FUNC) &XGBoosterSaveModel_R, 2}, {"XGBoosterSetAttr_R", (DL_FUNC) &XGBoosterSetAttr_R, 3}, {"XGBoosterSetParam_R", (DL_FUNC) &XGBoosterSetParam_R, 3}, diff --git a/R-package/src/xgboost_R.cc b/R-package/src/xgboost_R.cc index 4192f82fbaa6..5baf8d41282e 100644 --- a/R-package/src/xgboost_R.cc +++ b/R-package/src/xgboost_R.cc @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -207,25 +208,24 @@ SEXP SafeAllocInteger(size_t size, SEXP continuation_token) { return xgboost::Json::Dump(jinterface); } -[[nodiscard]] std::string MakeJsonConfigForArray(SEXP missing, SEXP n_threads, SEXPTYPE arr_type) { - using namespace ::xgboost; // NOLINT - Json jconfig{Object{}}; - - const SEXPTYPE missing_type = TYPEOF(missing); - if (Rf_isNull(missing) || (missing_type == REALSXP && ISNAN(Rf_asReal(missing))) || - (missing_type == LGLSXP && Rf_asLogical(missing) == R_NaInt) || - (missing_type == INTSXP && Rf_asInteger(missing) == R_NaInt)) { +void AddMissingToJson(xgboost::Json *jconfig, SEXP missing, SEXPTYPE arr_type) { + if (Rf_isNull(missing) || ISNAN(Rf_asReal(missing))) { // missing is not specified if (arr_type == REALSXP) { - jconfig["missing"] = std::numeric_limits::quiet_NaN(); + (*jconfig)["missing"] = std::numeric_limits::quiet_NaN(); } else { - jconfig["missing"] = R_NaInt; + (*jconfig)["missing"] = R_NaInt; } } else { // missing specified - jconfig["missing"] = Rf_asReal(missing); + (*jconfig)["missing"] = Rf_asReal(missing); } +} +[[nodiscard]] std::string MakeJsonConfigForArray(SEXP missing, SEXP n_threads, SEXPTYPE arr_type) { + using namespace ::xgboost; // NOLINT + Json jconfig{Object{}}; + AddMissingToJson(&jconfig, missing, arr_type); jconfig["nthread"] = Rf_asInteger(n_threads); return Json::Dump(jconfig); } @@ -411,7 +411,7 @@ XGB_DLL SEXP XGDMatrixCreateFromDF_R(SEXP df, SEXP missing, SEXP n_threads) { DMatrixHandle handle; std::int32_t rc{0}; { - std::string sinterface = MakeArrayInterfaceFromRDataFrame(df); + const std::string sinterface = MakeArrayInterfaceFromRDataFrame(df); xgboost::Json jconfig{xgboost::Object{}}; jconfig["missing"] = asReal(missing); jconfig["nthread"] = asInteger(n_threads); @@ -463,7 +463,7 @@ XGB_DLL SEXP XGDMatrixCreateFromCSC_R(SEXP indptr, SEXP indices, SEXP data, SEXP Json jconfig{Object{}}; // Construct configuration jconfig["nthread"] = Integer{threads}; - jconfig["missing"] = xgboost::Number{asReal(missing)}; + AddMissingToJson(&jconfig, missing, TYPEOF(data)); std::string config; Json::Dump(jconfig, &config); res_code = XGDMatrixCreateFromCSC(sindptr.c_str(), sindices.c_str(), sdata.c_str(), nrow, @@ -498,7 +498,7 @@ XGB_DLL SEXP XGDMatrixCreateFromCSR_R(SEXP indptr, SEXP indices, SEXP data, SEXP Json jconfig{Object{}}; // Construct configuration jconfig["nthread"] = Integer{threads}; - jconfig["missing"] = xgboost::Number{asReal(missing)}; + AddMissingToJson(&jconfig, missing, TYPEOF(data)); std::string config; Json::Dump(jconfig, &config); res_code = XGDMatrixCreateFromCSR(sindptr.c_str(), sindices.c_str(), sdata.c_str(), ncol, @@ -1247,7 +1247,60 @@ XGB_DLL SEXP XGBoosterEvalOneIter_R(SEXP handle, SEXP iter, SEXP dmats, SEXP evn return mkString(ret); } -XGB_DLL SEXP XGBoosterPredictFromDMatrix_R(SEXP handle, SEXP dmat, SEXP json_config) { +namespace { + +struct ProxyDmatrixError : public std::exception {}; + +struct ProxyDmatrixWrapper { + DMatrixHandle proxy_dmat_handle; + + ProxyDmatrixWrapper() { + int res_code = XGProxyDMatrixCreate(&this->proxy_dmat_handle); + if (res_code != 0) { + throw ProxyDmatrixError(); + } + } + + ~ProxyDmatrixWrapper() { + if (this->proxy_dmat_handle) { + XGDMatrixFree(this->proxy_dmat_handle); + this->proxy_dmat_handle = nullptr; + } + } + + DMatrixHandle get_handle() { + return this->proxy_dmat_handle; + } +}; + +std::unique_ptr GetProxyDMatrixWithBaseMargin(SEXP base_margin) { + if (Rf_isNull(base_margin)) { + return std::unique_ptr(nullptr); + } + + SEXP base_margin_dim = Rf_getAttrib(base_margin, R_DimSymbol); + int res_code; + try { + const std::string array_str = Rf_isNull(base_margin_dim)? + MakeArrayInterfaceFromRVector(base_margin) : MakeArrayInterfaceFromRMat(base_margin); + std::unique_ptr proxy_dmat(new ProxyDmatrixWrapper()); + res_code = XGDMatrixSetInfoFromInterface(proxy_dmat->get_handle(), + "base_margin", + array_str.c_str()); + if (res_code != 0) { + throw ProxyDmatrixError(); + } + return proxy_dmat; + } catch(ProxyDmatrixError &err) { + Rf_error("%s", XGBGetLastError()); + } +} + +enum class PredictionInputType {DMatrix, DenseMatrix, CSRMatrix, DataFrame}; + +SEXP XGBoosterPredictGeneric(SEXP handle, SEXP input_data, SEXP json_config, + PredictionInputType input_type, SEXP missing, + SEXP base_margin) { SEXP r_out_shape; SEXP r_out_result; SEXP r_out = PROTECT(allocVector(VECSXP, 2)); @@ -1259,9 +1312,79 @@ XGB_DLL SEXP XGBoosterPredictFromDMatrix_R(SEXP handle, SEXP dmat, SEXP json_con bst_ulong out_dim; bst_ulong const *out_shape; float const *out_result; - CHECK_CALL(XGBoosterPredictFromDMatrix(R_ExternalPtrAddr(handle), - R_ExternalPtrAddr(dmat), c_json_config, - &out_shape, &out_dim, &out_result)); + + int res_code; + { + switch (input_type) { + case PredictionInputType::DMatrix: { + res_code = XGBoosterPredictFromDMatrix(R_ExternalPtrAddr(handle), + R_ExternalPtrAddr(input_data), c_json_config, + &out_shape, &out_dim, &out_result); + break; + } + + case PredictionInputType::CSRMatrix: { + std::unique_ptr proxy_dmat = GetProxyDMatrixWithBaseMargin( + base_margin); + DMatrixHandle proxy_dmat_handle = proxy_dmat.get()? proxy_dmat->get_handle() : nullptr; + + SEXP indptr = VECTOR_ELT(input_data, 0); + SEXP indices = VECTOR_ELT(input_data, 1); + SEXP data = VECTOR_ELT(input_data, 2); + const int ncol_csr = Rf_asInteger(VECTOR_ELT(input_data, 3)); + const SEXPTYPE type_data = TYPEOF(data); + CHECK_EQ(type_data, REALSXP); + std::string sindptr, sindices, sdata; + CreateFromSparse(indptr, indices, data, &sindptr, &sindices, &sdata); + + xgboost::StringView json_str(c_json_config); + xgboost::Json new_json = xgboost::Json::Load(json_str); + AddMissingToJson(&new_json, missing, type_data); + const std::string new_c_json = xgboost::Json::Dump(new_json); + + res_code = XGBoosterPredictFromCSR( + R_ExternalPtrAddr(handle), sindptr.c_str(), sindices.c_str(), sdata.c_str(), + ncol_csr, new_c_json.c_str(), proxy_dmat_handle, &out_shape, &out_dim, &out_result); + break; + } + + case PredictionInputType::DenseMatrix: { + std::unique_ptr proxy_dmat = GetProxyDMatrixWithBaseMargin( + base_margin); + DMatrixHandle proxy_dmat_handle = proxy_dmat.get()? proxy_dmat->get_handle() : nullptr; + const std::string array_str = MakeArrayInterfaceFromRMat(input_data); + + xgboost::StringView json_str(c_json_config); + xgboost::Json new_json = xgboost::Json::Load(json_str); + AddMissingToJson(&new_json, missing, TYPEOF(input_data)); + const std::string new_c_json = xgboost::Json::Dump(new_json); + + res_code = XGBoosterPredictFromDense( + R_ExternalPtrAddr(handle), array_str.c_str(), new_c_json.c_str(), + proxy_dmat_handle, &out_shape, &out_dim, &out_result); + break; + } + + case PredictionInputType::DataFrame: { + std::unique_ptr proxy_dmat = GetProxyDMatrixWithBaseMargin( + base_margin); + DMatrixHandle proxy_dmat_handle = proxy_dmat.get()? proxy_dmat->get_handle() : nullptr; + + const std::string df_str = MakeArrayInterfaceFromRDataFrame(input_data); + + xgboost::StringView json_str(c_json_config); + xgboost::Json new_json = xgboost::Json::Load(json_str); + AddMissingToJson(&new_json, missing, REALSXP); + const std::string new_c_json = xgboost::Json::Dump(new_json); + + res_code = XGBoosterPredictFromColumnar( + R_ExternalPtrAddr(handle), df_str.c_str(), new_c_json.c_str(), + proxy_dmat_handle, &out_shape, &out_dim, &out_result); + break; + } + } + } + CHECK_CALL(res_code); r_out_shape = PROTECT(allocVector(INTSXP, out_dim)); size_t len = 1; @@ -1282,6 +1405,31 @@ XGB_DLL SEXP XGBoosterPredictFromDMatrix_R(SEXP handle, SEXP dmat, SEXP json_con return r_out; } +} // namespace + +XGB_DLL SEXP XGBoosterPredictFromDMatrix_R(SEXP handle, SEXP dmat, SEXP json_config) { + return XGBoosterPredictGeneric(handle, dmat, json_config, + PredictionInputType::DMatrix, R_NilValue, R_NilValue); +} + +XGB_DLL SEXP XGBoosterPredictFromDense_R(SEXP handle, SEXP R_mat, SEXP missing, + SEXP json_config, SEXP base_margin) { + return XGBoosterPredictGeneric(handle, R_mat, json_config, + PredictionInputType::DenseMatrix, missing, base_margin); +} + +XGB_DLL SEXP XGBoosterPredictFromCSR_R(SEXP handle, SEXP lst, SEXP missing, + SEXP json_config, SEXP base_margin) { + return XGBoosterPredictGeneric(handle, lst, json_config, + PredictionInputType::CSRMatrix, missing, base_margin); +} + +XGB_DLL SEXP XGBoosterPredictFromColumnar_R(SEXP handle, SEXP R_df, SEXP missing, + SEXP json_config, SEXP base_margin) { + return XGBoosterPredictGeneric(handle, R_df, json_config, + PredictionInputType::DataFrame, missing, base_margin); +} + XGB_DLL SEXP XGBoosterLoadModel_R(SEXP handle, SEXP fname) { R_API_BEGIN(); CHECK_CALL(XGBoosterLoadModel(R_ExternalPtrAddr(handle), CHAR(asChar(fname)))); diff --git a/R-package/src/xgboost_R.h b/R-package/src/xgboost_R.h index 652345e52b64..70fd885e7f12 100644 --- a/R-package/src/xgboost_R.h +++ b/R-package/src/xgboost_R.h @@ -371,6 +371,50 @@ XGB_DLL SEXP XGBoosterEvalOneIter_R(SEXP handle, SEXP iter, SEXP dmats, SEXP evn * \return A list containing 2 vectors, first one for shape while second one for prediction result. */ XGB_DLL SEXP XGBoosterPredictFromDMatrix_R(SEXP handle, SEXP dmat, SEXP json_config); + +/*! + * \brief Run prediction on R dense matrix + * \param handle handle + * \param R_mat R matrix + * \param missing missing value + * \param json_config See `XGBoosterPredictFromDense` in xgboost c_api.h. Doesn't include 'missing' + * \param base_margin base margin for the prediction + * + * \return A list containing 2 vectors, first one for shape while second one for prediction result. + */ +XGB_DLL SEXP XGBoosterPredictFromDense_R(SEXP handle, SEXP R_mat, SEXP missing, + SEXP json_config, SEXP base_margin); + +/*! + * \brief Run prediction on R CSR matrix + * \param handle handle + * \param lst An R list, containing, in this order: + * (a) 'p' array (a.k.a. indptr) + * (b) 'j' array (a.k.a. indices) + * (c) 'x' array (a.k.a. data / values) + * (d) number of columns + * \param missing missing value + * \param json_config See `XGBoosterPredictFromCSR` in xgboost c_api.h. Doesn't include 'missing' + * \param base_margin base margin for the prediction + * + * \return A list containing 2 vectors, first one for shape while second one for prediction result. + */ +XGB_DLL SEXP XGBoosterPredictFromCSR_R(SEXP handle, SEXP lst, SEXP missing, + SEXP json_config, SEXP base_margin); + +/*! + * \brief Run prediction on R data.frame + * \param handle handle + * \param R_df R data.frame + * \param missing missing value + * \param json_config See `XGBoosterPredictFromDense` in xgboost c_api.h. Doesn't include 'missing' + * \param base_margin base margin for the prediction + * + * \return A list containing 2 vectors, first one for shape while second one for prediction result. + */ +XGB_DLL SEXP XGBoosterPredictFromColumnar_R(SEXP handle, SEXP R_df, SEXP missing, + SEXP json_config, SEXP base_margin); + /*! * \brief load model from existing file * \param handle handle diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R index fb3162e423ce..5438c8bb2235 100644 --- a/R-package/tests/testthat/test_basic.R +++ b/R-package/tests/testthat/test_basic.R @@ -139,8 +139,8 @@ test_that("dart prediction works", { pred_by_train_1 <- predict(booster_by_train, newdata = dtrain, iterationrange = c(1, nrounds)) pred_by_train_2 <- predict(booster_by_train, newdata = dtrain, training = TRUE) - expect_true(all(matrix(pred_by_train_0, byrow = TRUE) == matrix(pred_by_xgboost_0, byrow = TRUE))) - expect_true(all(matrix(pred_by_train_1, byrow = TRUE) == matrix(pred_by_xgboost_1, byrow = TRUE))) + expect_equal(pred_by_train_0, pred_by_xgboost_0, tolerance = 1e-6) + expect_equal(pred_by_train_1, pred_by_xgboost_1, tolerance = 1e-6) expect_true(all(matrix(pred_by_train_2, byrow = TRUE) == matrix(pred_by_xgboost_2, byrow = TRUE))) }) @@ -651,6 +651,51 @@ test_that("Can use ranking objectives with either 'qid' or 'group'", { expect_equal(pred_qid, pred_gr) }) +test_that("Can predict on data.frame objects", { + data("mtcars") + y <- mtcars$mpg + x_df <- mtcars[, -1] + x_mat <- as.matrix(x_df) + dm <- xgb.DMatrix(x_mat, label = y, nthread = n_threads) + model <- xgb.train( + params = list( + tree_method = "hist", + objective = "reg:squarederror", + nthread = n_threads + ), + data = dm, + nrounds = 5 + ) + + pred_mat <- predict(model, xgb.DMatrix(x_mat), nthread = n_threads) + pred_df <- predict(model, x_df, nthread = n_threads) + expect_equal(pred_mat, pred_df) +}) + +test_that("'base_margin' gives the same result in DMatrix as in inplace_predict", { + data("mtcars") + y <- mtcars$mpg + x <- as.matrix(mtcars[, -1]) + dm <- xgb.DMatrix(x, label = y, nthread = n_threads) + model <- xgb.train( + params = list( + tree_method = "hist", + objective = "reg:squarederror", + nthread = n_threads + ), + data = dm, + nrounds = 5 + ) + + set.seed(123) + base_margin <- rnorm(nrow(x)) + dm_w_base <- xgb.DMatrix(data = x, base_margin = base_margin) + pred_from_dm <- predict(model, dm_w_base) + pred_from_mat <- predict(model, x, base_margin = base_margin) + + expect_equal(pred_from_dm, pred_from_mat) +}) + test_that("Coefficients from gblinear have the expected shape and names", { # Single-column coefficients data(mtcars) diff --git a/R-package/tests/testthat/test_dmatrix.R b/R-package/tests/testthat/test_dmatrix.R index 45bcac08d479..0612406444ae 100644 --- a/R-package/tests/testthat/test_dmatrix.R +++ b/R-package/tests/testthat/test_dmatrix.R @@ -302,6 +302,37 @@ test_that("xgb.DMatrix: Inf as missing", { file.remove(fname_nan) }) +test_that("xgb.DMatrix: missing in CSR", { + x_dense <- matrix(as.numeric(1:10), nrow = 5) + x_dense[2, 1] <- NA_real_ + + x_csr <- as(x_dense, "RsparseMatrix") + + m_dense <- xgb.DMatrix(x_dense, nthread = n_threads, missing = NA_real_) + xgb.DMatrix.save(m_dense, "dense.dmatrix") + + m_csr <- xgb.DMatrix(x_csr, nthread = n_threads, missing = NA) + xgb.DMatrix.save(m_csr, "csr.dmatrix") + + denseconn <- file("dense.dmatrix", "rb") + csrconn <- file("csr.dmatrix", "rb") + + expect_equal(file.size("dense.dmatrix"), file.size("csr.dmatrix")) + + bytes <- file.size("dense.dmatrix") + densedmatrix <- readBin(denseconn, "raw", n = bytes) + csrmatrix <- readBin(csrconn, "raw", n = bytes) + + expect_equal(length(densedmatrix), length(csrmatrix)) + expect_equal(densedmatrix, csrmatrix) + + close(denseconn) + close(csrconn) + + file.remove("dense.dmatrix") + file.remove("csr.dmatrix") +}) + test_that("xgb.DMatrix: error on three-dimensional array", { set.seed(123) x <- matrix(rnorm(500), nrow = 50)