Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Draft] [R] Add QuantileDMatrix creation from dense matrices #9864

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 47 additions & 9 deletions R-package/R/xgb.DMatrix.R
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,25 @@
#' @param label_lower_bound Lower bound for survival training.
#' @param label_upper_bound Upper bound for survival training.
#' @param feature_weights Set feature weights for column sampling.
#' @param as_quantile_dmatrix Whether to generate a QuantileDMatrix instead of a regular DMatrix.
#'
#' A QuantileDMatrix generates quantilized data directly from input for the \code{hist} tree method.
#' This DMatrix variant is primarily designed to save memory in training by avoiding intermediate storage.
#'
#' Currently, QuantileDMatrix creation is only supported from dense matrices (class \code{matrix} from base R).
#'
#' When the resulting object is generated as a QuantileDMatrix, it will have the class
#' \code{xgb.QuantileDMatrix} in addition to inheriting from the regular \code{xgb.DMatrix} class.
#' @param ref The training dataset that provides quantile information, needed when creating validation/test dataset
#' with QuantileDMatrix. Supplying the training DMatrix as a reference means that the same quantisation applied to
#' the training data is applied to the validation/test data.
#'
#' This is ignored when passing \code{as_quantile_dmatrix = FALSE} or when construction of a QuantileDMatrix is not
#' possible from the supplied inputs.
#' @param max_bin The number of histogram bins. Should be consistent with the training parameter \code{max_bin}.
#'
#' This is ignored when passing \code{as_quantile_dmatrix = FALSE} or when construction of a QuantileDMatrix is not
#' possible from the supplied inputs.
#' @param enable_categorical Experimental support of specializing for categorical features.
#'
#' If passing 'TRUE' and 'data' is a data frame,
Expand Down Expand Up @@ -72,11 +91,16 @@ xgb.DMatrix <- function(
label_lower_bound = NULL,
label_upper_bound = NULL,
feature_weights = NULL,
enable_categorical = FALSE
enable_categorical = FALSE,
as_quantile_dmatrix = FALSE,
ref = NULL,
max_bin = NULL
) {
if (!is.null(group) && !is.null(qid)) {
stop("Either one of 'group' or 'qid' should be NULL")
}
is_quantile_dmatrix <- FALSE
nthread <- as.integer(NVL(nthread, -1))
ctypes <- NULL
if (typeof(data) == "character") {
if (length(data) > 1) {
Expand All @@ -88,9 +112,18 @@ xgb.DMatrix <- function(
data <- path.expand(data)
handle <- .Call(XGDMatrixCreateFromFile_R, data, as.integer(silent))
} else if (is.matrix(data)) {
handle <- .Call(
XGDMatrixCreateFromMat_R, data, missing, as.integer(NVL(nthread, -1))
)
if (!as_quantile_dmatrix) {
handle <- .Call(XGDMatrixCreateFromMat_R, data, missing, nthread)
} else {
if (!is.null(ref)) {
if (!inherits(ref, "xgb.DMatrix")) {
stop("'ref' must be an xgb.DMatrix object.")
}
}
handle <- .Call(XGQuantileDMatrixFromMat_R, data, missing,
nthread, max_bin, ref)
is_quantile_dmatrix <- TRUE
}
} else if (inherits(data, "dgCMatrix")) {
handle <- .Call(
XGDMatrixCreateFromCSC_R,
Expand All @@ -99,7 +132,7 @@ xgb.DMatrix <- function(
data@x,
nrow(data),
missing,
as.integer(NVL(nthread, -1))
nthread
)
} else if (inherits(data, "dgRMatrix")) {
handle <- .Call(
Expand All @@ -109,7 +142,7 @@ xgb.DMatrix <- function(
data@x,
ncol(data),
missing,
as.integer(NVL(nthread, -1))
nthread
)
} else if (inherits(data, "dsparseVector")) {
indptr <- c(0L, as.integer(length(data@i)))
Expand All @@ -121,7 +154,7 @@ xgb.DMatrix <- function(
data@x,
length(data),
missing,
as.integer(NVL(nthread, -1))
nthread
)
} else if (is.data.frame(data)) {
ctypes <- sapply(data, function(x) {
Expand Down Expand Up @@ -161,7 +194,11 @@ xgb.DMatrix <- function(
}

dmat <- handle
attributes(dmat) <- list(class = "xgb.DMatrix")
dmat_class <- "xgb.DMatrix"
if (is_quantile_dmatrix) {
dmat_class <- c(dmat_class, "xgb.QuantileDMatrix")
}
attributes(dmat) <- list(class = dmat_class)

if (!is.null(label)) {
setinfo(dmat, "label", label)
Expand Down Expand Up @@ -568,7 +605,8 @@ slice.xgb.DMatrix <- function(object, idxset, ...) {
#' @method print xgb.DMatrix
#' @export
print.xgb.DMatrix <- function(x, verbose = FALSE, ...) {
cat('xgb.DMatrix dim:', nrow(x), 'x', ncol(x), ' info: ')
class_print <- ifelse(inherits(x, "xgb.QuantileDMatrix"), "xgb.QuantileDMatrix", "xgb.DMatrix")
cat(class_print, ' dim:', nrow(x), 'x', ncol(x), ' info: ')
infos <- character(0)
if (length(getinfo(x, 'label')) > 0) infos <- 'label'
if (length(getinfo(x, 'weight')) > 0) infos <- c(infos, 'weight')
Expand Down
27 changes: 26 additions & 1 deletion R-package/man/xgb.DMatrix.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions R-package/src/init.c
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ extern SEXP XGDMatrixSaveBinary_R(SEXP, SEXP, SEXP);
extern SEXP XGDMatrixSetInfo_R(SEXP, SEXP, SEXP);
extern SEXP XGDMatrixSetStrFeatureInfo_R(SEXP, SEXP, SEXP);
extern SEXP XGDMatrixSliceDMatrix_R(SEXP, SEXP);
extern SEXP XGQuantileDMatrixFromMat_R(SEXP, SEXP, SEXP, SEXP, SEXP);
extern SEXP XGBSetGlobalConfig_R(SEXP);
extern SEXP XGBGetGlobalConfig_R(void);
extern SEXP XGBoosterFeatureScore_R(SEXP, SEXP);
Expand Down Expand Up @@ -88,6 +89,7 @@ static const R_CallMethodDef CallEntries[] = {
{"XGDMatrixSetInfo_R", (DL_FUNC) &XGDMatrixSetInfo_R, 3},
{"XGDMatrixSetStrFeatureInfo_R", (DL_FUNC) &XGDMatrixSetStrFeatureInfo_R, 3},
{"XGDMatrixSliceDMatrix_R", (DL_FUNC) &XGDMatrixSliceDMatrix_R, 2},
{"XGQuantileDMatrixFromMat_R", (DL_FUNC) &XGQuantileDMatrixFromMat_R, 5},
{"XGBSetGlobalConfig_R", (DL_FUNC) &XGBSetGlobalConfig_R, 1},
{"XGBGetGlobalConfig_R", (DL_FUNC) &XGBGetGlobalConfig_R, 0},
{"XGBoosterFeatureScore_R", (DL_FUNC) &XGBoosterFeatureScore_R, 2},
Expand Down
91 changes: 91 additions & 0 deletions R-package/src/xgboost_R.cc
Original file line number Diff line number Diff line change
Expand Up @@ -519,6 +519,97 @@ XGB_DLL SEXP XGDMatrixNumCol_R(SEXP handle) {
return ScalarInteger(static_cast<int>(ncol));
}

/*! \brief Thrown by the single-batch iterator when handing the data to the proxy DMatrix fails. */
struct IteratorError : public std::exception {};

struct _RMatrixSingleIterator {
int iter;
DMatrixHandle proxy_dmat_handle;
const char *array_str;

_RMatrixSingleIterator(
DMatrixHandle proxy_dmat_handle,
const char *array_str) : iter(0), proxy_dmat_handle(proxy_dmat_handle), array_str(array_str) {}

void reset() {
this->iter = 0;
}

int next() {
if (this->iter >= 1) {
return 0;
}

int res_code = XGProxyDMatrixSetDataDense(this->proxy_dmat_handle, this->array_str);
if (res_code != 0) {
throw IteratorError();
}
this->iter++;
return 1;
}
};

/*! \brief Plain-function reset callback: forwards to _RMatrixSingleIterator::reset(). */
static void _reset_RMatrixSingleIterator(DataIterHandle iter) {
  _RMatrixSingleIterator *self = static_cast<_RMatrixSingleIterator*>(iter);
  self->reset();
}

/*! \brief Plain-function next callback: forwards to _RMatrixSingleIterator::next(). */
static int _next_RMatrixSingleIterator(DataIterHandle iter) {
  _RMatrixSingleIterator *self = static_cast<_RMatrixSingleIterator*>(iter);
  return self->next();
}

/*!
 * \brief Create a QuantileDMatrix from a dense R matrix using a single-batch iterator.
 * \param R_mat R matrix object holding the data
 * \param missing value to be treated as missing (read with Rf_asReal)
 * \param n_threads number of threads, or R NULL to leave it out of the configuration
 * \param max_bin maximum number of histogram bins, or R NULL to leave it out of the configuration
 * \param ref_dmat optional reference DMatrix handle, or R NULL when there is none
 * \return external pointer to the created DMatrix, with _DMatrixFinalizer registered on it
 */
XGB_DLL SEXP XGQuantileDMatrixFromMat_R(SEXP R_mat, SEXP missing, SEXP n_threads,
                                        SEXP max_bin, SEXP ref_dmat) {
  SEXP ret = PROTECT(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue));
  R_API_BEGIN();
  DMatrixHandle proxy_dmat_handle;
  CHECK_CALL(XGProxyDMatrixCreate(&proxy_dmat_handle));
  // Initialised so that no path can read an indeterminate value.
  DMatrixHandle out_dmat = nullptr;
  int res_code1 = 0;
  int res_code2 = 0;

  try {
    xgboost::Json jconfig{xgboost::Object{}};
    /* FIXME: this 'missing' field should have R_NaInt when the input is an integer matrix. */
    jconfig["missing"] = Rf_asReal(missing);
    if (!Rf_isNull(n_threads)) {
      jconfig["nthread"] = Rf_asInteger(n_threads);
    }
    if (!Rf_isNull(max_bin)) {
      jconfig["max_bin"] = Rf_asInteger(max_bin);
    }
    std::string json_str = xgboost::Json::Dump(jconfig);

    DMatrixHandle ref_dmat_handle = nullptr;
    if (!Rf_isNull(ref_dmat)) {
      ref_dmat_handle = R_ExternalPtrAddr(ref_dmat);
    }

    // 'array_str' must outlive the iterator, which only keeps a raw pointer to it.
    std::string array_str = MakeArrayInterfaceFromRMat(R_mat);
    _RMatrixSingleIterator single_iterator(proxy_dmat_handle, array_str.c_str());

    res_code1 = XGQuantileDMatrixCreateFromCallback(
      &single_iterator,
      proxy_dmat_handle,
      ref_dmat_handle,
      _reset_RMatrixSingleIterator,
      _next_RMatrixSingleIterator,
      json_str.c_str(),
      &out_dmat);
    res_code2 = XGDMatrixFree(proxy_dmat_handle);
  } catch (IteratorError &) {
    XGDMatrixFree(proxy_dmat_handle);
    // Pass the message as an argument, never as the format string: it is
    // runtime text and may contain '%' characters.
    Rf_error("%s", XGBGetLastError());
  } catch (...) {
    // Any other exception raised before the proxy was released would leak it;
    // free it here and let the enclosing handler deal with the exception.
    XGDMatrixFree(proxy_dmat_handle);
    throw;
  }

  CHECK_CALL(res_code2);
  CHECK_CALL(res_code1);

  R_SetExternalPtrAddr(ret, out_dmat);
  R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
  R_API_END();

  UNPROTECT(1);
  return ret;
}
Comment on lines +559 to +611
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Depending on your roadmap, it might be desirable to implement this in R instead of C++: the data iterator is the common interface to QDM and external memory. I will share some old unpublished documents soon.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it'd be better to use a dedicated C++-only route for single-iteration QuantileDMatrix, and then later on implement a customizable DataIterator in R.


// functions related to booster
void _BoosterFinalizer(SEXP ext) {
if (R_ExternalPtrAddr(ext) == NULL) return;
Expand Down
13 changes: 13 additions & 0 deletions R-package/src/xgboost_R.h
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,19 @@ XGB_DLL SEXP XGDMatrixNumRow_R(SEXP handle);
*/
XGB_DLL SEXP XGDMatrixNumCol_R(SEXP handle);

/*!
 * \brief Create a QuantileDMatrix from a dense R matrix.
 *  This assumes the matrix is stored in column major format.
 * \param R_mat R matrix object holding the data
 * \param missing value that should be treated as missing
 * \param n_threads number of threads used to construct the DMatrix from the dense matrix,
 *  or R NULL to leave it unset
 * \param max_bin maximum number of bins for building histograms, or R NULL to leave it unset
 * \param ref_dmat optional reference DMatrix providing quantile information,
 *  or R NULL when there is none
 * \return created dmatrix
 */
XGB_DLL SEXP XGQuantileDMatrixFromMat_R(SEXP R_mat, SEXP missing, SEXP n_threads,
SEXP max_bin, SEXP ref_dmat);

/*!
* \brief create xgboost learner
* \param dmats a list of dmatrix handles that will be cached
Expand Down
64 changes: 64 additions & 0 deletions R-package/tests/testthat/test_dmatrix.R
Original file line number Diff line number Diff line change
Expand Up @@ -323,6 +323,70 @@ test_that("xgb.DMatrix: can get group for both 'qid' and 'group' constructors",
expect_equal(info_gr, expected_gr)
})

test_that("xgb.DMatrix: QuantileDMatrix produces same result as DMatrix", {
  # Train the same 'hist' model once from a QuantileDMatrix and once from a
  # regular DMatrix, then compare the predictions on the training matrix.
  data(mtcars)
  features <- as.matrix(mtcars[, -1])
  target <- mtcars[, 1]
  shared_params <- list(
    tree_method = "hist",
    objective = "reg:squarederror",
    nthread = n_threads,
    max_bin = 5
  )

  # Quantile route: 'max_bin' matches the value in the training parameters.
  quantile_dmat <- xgb.DMatrix(
    data = features,
    label = target,
    as_quantile_dmatrix = TRUE,
    nthread = n_threads,
    max_bin = 5
  )
  booster_qdm <- xgb.train(params = shared_params, data = quantile_dmat, nrounds = 2)
  pred_qdm <- predict(booster_qdm, features)

  # Regular route.
  regular_dmat <- xgb.DMatrix(
    data = features,
    label = target,
    as_quantile_dmatrix = FALSE,
    nthread = n_threads
  )
  booster_dm <- xgb.train(params = shared_params, data = regular_dmat, nrounds = 2)
  pred_dm <- predict(booster_dm, features)

  expect_equal(pred_qdm, pred_dm)
})

test_that("xgb.DMatrix: QuantileDMatrix is not accepted by exact method", {
  # Training with tree_method = "exact" on a QuantileDMatrix must raise an error.
  data(mtcars)
  features <- as.matrix(mtcars[, -1])
  target <- mtcars[, 1]
  exact_params <- list(
    tree_method = "exact",
    objective = "reg:squarederror",
    nthread = n_threads
  )
  quantile_dmat <- xgb.DMatrix(
    data = features,
    label = target,
    as_quantile_dmatrix = TRUE,
    nthread = n_threads
  )
  expect_error(
    xgb.train(params = exact_params, data = quantile_dmat, nrounds = 2)
  )
})

test_that("xgb.DMatrix: data.frame", {
df <- data.frame(
a = (1:4) / 10,
Expand Down
Loading