From deb70773b87a494f626c5b2c088f372f116c6559 Mon Sep 17 00:00:00 2001 From: Oliver Borchert Date: Mon, 13 Nov 2023 19:03:58 +0100 Subject: [PATCH 01/19] [python-package] Allow to pass Arrow array as weights (#6164) --- include/LightGBM/c_api.h | 4 +- include/LightGBM/dataset.h | 4 ++ python-package/lightgbm/basic.py | 29 +++++++---- python-package/lightgbm/compat.py | 7 +++ src/io/dataset.cpp | 2 + src/io/metadata.cpp | 28 ++++++++--- tests/python_package_test/test_arrow.py | 66 ++++++++++++++++++++----- 7 files changed, 108 insertions(+), 32 deletions(-) diff --git a/include/LightGBM/c_api.h b/include/LightGBM/c_api.h index a46f8332811a..fd337cbc7cbe 100644 --- a/include/LightGBM/c_api.h +++ b/include/LightGBM/c_api.h @@ -558,9 +558,9 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetSetField(DatasetHandle handle, /*! * \brief Set vector to a content in info. * \note - * - \a label convert input datatype into ``float32``. + * - \a label and \a weight convert input datatype into ``float32``. * \param handle Handle of dataset - * \param field_name Field name, can be \a label + * \param field_name Field name, can be \a label, \a weight * \param n_chunks The number of Arrow arrays passed to this function * \param chunks Pointer to the list of Arrow arrays * \param schema Pointer to the schema of all Arrow arrays diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index 56bc7b841dc3..48c1bee804d7 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -113,6 +113,7 @@ class Metadata { void SetLabel(const ArrowChunkedArray& array); void SetWeights(const label_t* weights, data_size_t len); + void SetWeights(const ArrowChunkedArray& array); void SetQuery(const data_size_t* query, data_size_t len); @@ -340,6 +341,9 @@ class Metadata { void SetLabelsFromIterator(It first, It last); /*! \brief Insert weights at the given index */ void InsertWeights(const label_t* weights, data_size_t start_index, data_size_t len); + /*! \brief Set weights from pointers to the first element and the end of an iterator. */ + template + void SetWeightsFromIterator(It first, It last); /*! \brief Insert initial scores at the given index */ void InsertInitScores(const double* init_scores, data_size_t start_index, data_size_t len, data_size_t source_size); /*! \brief Insert queries at the given index */ diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index e8d8bd84cbe7..939842df3389 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -19,7 +19,8 @@ import scipy.sparse from .compat import (PANDAS_INSTALLED, PYARROW_INSTALLED, arrow_cffi, arrow_is_floating, arrow_is_integer, concat, - dt_DataTable, pa_Array, pa_ChunkedArray, pa_Table, pd_CategoricalDtype, pd_DataFrame, pd_Series) + dt_DataTable, pa_Array, pa_ChunkedArray, pa_compute, pa_Table, pd_CategoricalDtype, pd_DataFrame, + pd_Series) from .libpath import find_lib_path if TYPE_CHECKING: @@ -115,7 +116,9 @@ List[float], List[int], np.ndarray, - pd_Series + pd_Series, + pa_Array, + pa_ChunkedArray, ] ZERO_THRESHOLD = 1e-35 @@ -1635,7 +1638,7 @@ def __init__( Label of the data. reference : Dataset or None, optional (default=None) If this is Dataset for validation, training data should be used as reference. - weight : list, numpy 1-D array, pandas Series or None, optional (default=None) + weight : list, numpy 1-D array, pandas Series, pyarrow Array, pyarrow ChunkedArray or None, optional (default=None) Weight for each instance. Weights should be non-negative. 
group : list, numpy 1-D array, pandas Series or None, optional (default=None) Group/query data. @@ -2415,7 +2418,7 @@ def create_valid( If str or pathlib.Path, it represents the path to a text file (CSV, TSV, or LibSVM) or a LightGBM Dataset binary file. label : list, numpy 1-D array, pandas Series / one-column DataFrame, pyarrow Array, pyarrow ChunkedArray or None, optional (default=None) Label of the data. - weight : list, numpy 1-D array, pandas Series or None, optional (default=None) + weight : list, numpy 1-D array, pandas Series, pyarrow Array, pyarrow ChunkedArray or None, optional (default=None) Weight for each instance. Weights should be non-negative. group : list, numpy 1-D array, pandas Series or None, optional (default=None) Group/query data. @@ -2830,7 +2833,7 @@ def set_weight( Parameters ---------- - weight : list, numpy 1-D array, pandas Series or None + weight : list, numpy 1-D array, pandas Series, pyarrow Array, pyarrow ChunkedArray or None Weight to be set for each data point. Weights should be non-negative. Returns @@ -2838,11 +2841,19 @@ def set_weight( self : Dataset Dataset with set weight. """ - if weight is not None and np.all(weight == 1): - weight = None + # Check if the weight contains values other than one + if weight is not None: + if _is_pyarrow_array(weight): + if pa_compute.all(pa_compute.equal(weight, 1)).as_py(): + weight = None + elif np.all(weight == 1): + weight = None self.weight = weight + + # Set field if self._handle is not None and weight is not None: - weight = _list_to_1d_numpy(weight, dtype=np.float32, name='weight') + if not _is_pyarrow_array(weight): + weight = _list_to_1d_numpy(weight, dtype=np.float32, name='weight') self.set_field('weight', weight) self.weight = self.get_field('weight') # original values can be modified at cpp side return self @@ -4414,7 +4425,7 @@ def refit( .. versionadded:: 4.0.0 - weight : list, numpy 1-D array, pandas Series or None, optional (default=None) + weight : list, numpy 1-D array, pandas Series, pyarrow Array, pyarrow ChunkedArray or None, optional (default=None) Weight for each ``data`` instance. Weights should be non-negative. .. 
versionadded:: 4.0.0 diff --git a/python-package/lightgbm/compat.py b/python-package/lightgbm/compat.py index 984972ed1ae3..dc48dbf792cf 100644 --- a/python-package/lightgbm/compat.py +++ b/python-package/lightgbm/compat.py @@ -197,6 +197,7 @@ def __init__(self, *args, **kwargs): """pyarrow""" try: + import pyarrow.compute as pa_compute from pyarrow import Array as pa_Array from pyarrow import ChunkedArray as pa_ChunkedArray from pyarrow import Table as pa_Table @@ -236,6 +237,12 @@ class arrow_cffi: # type: ignore def __init__(self, *args, **kwargs): pass + class pa_compute: # type: ignore + """Dummy class for pyarrow.compute.""" + + all = None + equal = None + arrow_is_integer = None arrow_is_floating = None diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index e78f8a6b696c..01eb41b71367 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -902,6 +902,8 @@ bool Dataset::SetFieldFromArrow(const char* field_name, const ArrowChunkedArray name = Common::Trim(name); if (name == std::string("label") || name == std::string("target")) { metadata_.SetLabel(ca); + } else if (name == std::string("weight") || name == std::string("weights")) { + metadata_.SetWeights(ca); } else { return false; } diff --git a/src/io/metadata.cpp b/src/io/metadata.cpp index 41f9e3bf43c6..ed4fb135e62a 100644 --- a/src/io/metadata.cpp +++ b/src/io/metadata.cpp @@ -450,33 +450,45 @@ void Metadata::InsertLabels(const label_t* labels, data_size_t start_index, data // CUDA is handled after all insertions are complete } -void Metadata::SetWeights(const label_t* weights, data_size_t len) { +template +void Metadata::SetWeightsFromIterator(It first, It last) { std::lock_guard lock(mutex_); - // save to nullptr - if (weights == nullptr || len == 0) { + // Clear weights on empty input + if (last - first == 0) { weights_.clear(); num_weights_ = 0; return; } - if (num_data_ != len) { - Log::Fatal("Length of weights is not same with #data"); + if (num_data_ != last - first) { + Log::Fatal("Length of weights differs from the length of #data"); + } + if (weights_.empty()) { + weights_.resize(num_data_); } - if (weights_.empty()) { weights_.resize(num_data_); } num_weights_ = num_data_; #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (num_weights_ >= 1024) for (data_size_t i = 0; i < num_weights_; ++i) { - weights_[i] = Common::AvoidInf(weights[i]); + weights_[i] = Common::AvoidInf(first[i]); } CalculateQueryWeights(); weight_load_from_file_ = false; + #ifdef USE_CUDA if (cuda_metadata_ != nullptr) { - cuda_metadata_->SetWeights(weights_.data(), len); + cuda_metadata_->SetWeights(weights_.data(), weights_.size()); } #endif // USE_CUDA } +void Metadata::SetWeights(const label_t* weights, data_size_t len) { + SetWeightsFromIterator(weights, weights + len); +} + +void Metadata::SetWeights(const ArrowChunkedArray& array) { + SetWeightsFromIterator(array.begin(), array.end()); +} + void Metadata::InsertWeights(const label_t* weights, data_size_t start_index, data_size_t len) { if (!weights) { Log::Fatal("Passed null weights"); diff --git a/tests/python_package_test/test_arrow.py b/tests/python_package_test/test_arrow.py index 1dd270c8ec53..40482a904a62 100644 --- a/tests/python_package_test/test_arrow.py +++ b/tests/python_package_test/test_arrow.py @@ -9,6 +9,8 @@ import lightgbm as lgb +from .utils import np_assert_array_equal + # ----------------------------------------------------------------------------------------------- # # UTILITIES # # 
----------------------------------------------------------------------------------------------- # @@ -67,10 +69,6 @@ def dummy_dataset_params() -> Dict[str, Any]: } -def assert_arrays_equal(lhs: np.ndarray, rhs: np.ndarray): - assert lhs.dtype == rhs.dtype and np.array_equal(lhs, rhs) - - # ----------------------------------------------------------------------------------------------- # # UNIT TESTS # # ----------------------------------------------------------------------------------------------- # @@ -103,6 +101,34 @@ def test_dataset_construct_fuzzy( assert filecmp.cmp(tmp_path / "arrow.txt", tmp_path / "pandas.txt") +# -------------------------------------------- FIELDS ------------------------------------------- # + + +def test_dataset_construct_fields_fuzzy(): + arrow_table = generate_random_arrow_table(3, 1000, 42) + arrow_labels = generate_random_arrow_array(1000, 42) + arrow_weights = generate_random_arrow_array(1000, 42) + + arrow_dataset = lgb.Dataset(arrow_table, label=arrow_labels, weight=arrow_weights) + arrow_dataset.construct() + + pandas_dataset = lgb.Dataset( + arrow_table.to_pandas(), label=arrow_labels.to_numpy(), weight=arrow_weights.to_numpy() + ) + pandas_dataset.construct() + + # Check for equality + for field in ("label", "weight"): + np_assert_array_equal( + arrow_dataset.get_field(field), pandas_dataset.get_field(field), strict=True + ) + np_assert_array_equal(arrow_dataset.get_label(), pandas_dataset.get_label(), strict=True) + np_assert_array_equal(arrow_dataset.get_weight(), pandas_dataset.get_weight(), strict=True) + + +# -------------------------------------------- LABELS ------------------------------------------- # + + @pytest.mark.parametrize( ["array_type", "label_data"], [(pa.array, [0, 1, 0, 0, 1]), (pa.chunked_array, [[0], [1, 0, 0, 1]])], @@ -129,17 +155,31 @@ def test_dataset_construct_labels(array_type: Any, label_data: Any, arrow_type: dataset.construct() expected = np.array([0, 1, 0, 0, 1], dtype=np.float32) - assert_arrays_equal(expected, dataset.get_label()) + np_assert_array_equal(expected, dataset.get_label(), strict=True) -def test_dataset_construct_labels_fuzzy(): - arrow_table = generate_random_arrow_table(3, 1000, 42) - arrow_array = generate_random_arrow_array(1000, 42) +# ------------------------------------------- WEIGHTS ------------------------------------------- # - arrow_dataset = lgb.Dataset(arrow_table, label=arrow_array) - arrow_dataset.construct() - pandas_dataset = lgb.Dataset(arrow_table.to_pandas(), label=arrow_array.to_numpy()) - pandas_dataset.construct() +def test_dataset_construct_weights_none(): + data = generate_dummy_arrow_table() + weight = pa.array([1, 1, 1, 1, 1]) + dataset = lgb.Dataset(data, weight=weight, params=dummy_dataset_params()) + dataset.construct() + assert dataset.get_weight() is None + assert dataset.get_field("weight") is None + + +@pytest.mark.parametrize( + ["array_type", "weight_data"], + [(pa.array, [3, 0.7, 1.5, 0.5, 0.1]), (pa.chunked_array, [[3], [0.7, 1.5, 0.5, 0.1]])], +) +@pytest.mark.parametrize("arrow_type", [pa.float32(), pa.float64()]) +def test_dataset_construct_weights(array_type: Any, weight_data: Any, arrow_type: Any): + data = generate_dummy_arrow_table() + weights = array_type(weight_data, type=arrow_type) + dataset = lgb.Dataset(data, weight=weights, params=dummy_dataset_params()) + dataset.construct() - assert_arrays_equal(arrow_dataset.get_label(), pandas_dataset.get_label()) + expected = np.array([3, 0.7, 1.5, 0.5, 0.1], dtype=np.float32) + np_assert_array_equal(expected, 
dataset.get_weight(), strict=True) From 694e41e4a3ec44987667755dd989f83f0ecd4311 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Mon, 13 Nov 2023 12:26:44 -0600 Subject: [PATCH 02/19] [R-package] standardize naming of internal functions (#6179) --- R-package/R/callback.R | 8 +-- R-package/R/lgb.Booster.R | 34 +++++----- R-package/R/lgb.DataProcessor.R | 2 +- R-package/R/lgb.Dataset.R | 54 +++++++-------- R-package/R/lgb.Predictor.R | 10 +-- R-package/R/lgb.cv.R | 38 +++++------ R-package/R/lgb.drop_serialized.R | 2 +- R-package/R/lgb.importance.R | 2 +- R-package/R/lgb.interprete.R | 8 +-- R-package/R/lgb.make_serializable.R | 2 +- R-package/R/lgb.model.dt.tree.R | 7 +- R-package/R/lgb.plot.interpretation.R | 6 +- R-package/R/lgb.restore_handle.R | 2 +- R-package/R/lgb.train.R | 34 ++++++---- R-package/R/lightgbm.R | 10 +-- R-package/R/saveRDS.lgb.Booster.R | 2 +- R-package/R/utils.R | 20 +++--- R-package/tests/testthat/test_Predictor.R | 10 +-- R-package/tests/testthat/test_basic.R | 22 +++---- R-package/tests/testthat/test_dataset.R | 16 ++--- .../tests/testthat/test_learning_to_rank.R | 2 +- R-package/tests/testthat/test_lgb.Booster.R | 40 +++++------ R-package/tests/testthat/test_utils.R | 66 +++++++++---------- 23 files changed, 203 insertions(+), 194 deletions(-) diff --git a/R-package/R/callback.R b/R-package/R/callback.R index 3569b47f5b14..c436409ddafb 100644 --- a/R-package/R/callback.R +++ b/R-package/R/callback.R @@ -323,17 +323,17 @@ cb_early_stop <- function(stopping_rounds, first_metric_only, verbose) { } # Extract callback names from the list of callbacks -callback.names <- function(cb_list) { +.callback_names <- function(cb_list) { return(unlist(lapply(cb_list, attr, "name"))) } -add.cb <- function(cb_list, cb) { +.add_cb <- function(cb_list, cb) { # Combine two elements cb_list <- c(cb_list, cb) # Set names of elements - names(cb_list) <- callback.names(cb_list = cb_list) + names(cb_list) <- .callback_names(cb_list = cb_list) if ("cb_early_stop" %in% names(cb_list)) { @@ -349,7 +349,7 @@ add.cb <- function(cb_list, cb) { } -categorize.callbacks <- function(cb_list) { +.categorize_callbacks <- function(cb_list) { # Check for pre-iteration or post-iteration return( diff --git a/R-package/R/lgb.Booster.R b/R-package/R/lgb.Booster.R index 2256a250b131..17da9545ae19 100644 --- a/R-package/R/lgb.Booster.R +++ b/R-package/R/lgb.Booster.R @@ -31,12 +31,12 @@ Booster <- R6::R6Class( if (!is.null(train_set)) { - if (!lgb.is.Dataset(train_set)) { + if (!.is_Dataset(train_set)) { stop("lgb.Booster: Can only use lgb.Dataset as training data") } train_set_handle <- train_set$.__enclos_env__$private$get_handle() params <- utils::modifyList(params, train_set$get_params()) - params_str <- lgb.params2str(params = params) + params_str <- .params2str(params = params) # Store booster handle handle <- .Call( LGBM_BoosterCreate_R @@ -130,7 +130,7 @@ Booster <- R6::R6Class( # Add validation data add_valid = function(data, name) { - if (!lgb.is.Dataset(data)) { + if (!.is_Dataset(data)) { stop("lgb.Booster.add_valid: Can only use lgb.Dataset as validation data") } @@ -167,7 +167,7 @@ Booster <- R6::R6Class( params <- utils::modifyList(self$params, params) } - params_str <- lgb.params2str(params = params) + params_str <- .params2str(params = params) self$restore_handle() @@ -193,7 +193,7 @@ Booster <- R6::R6Class( if (!is.null(train_set)) { - if (!lgb.is.Dataset(train_set)) { + if (!.is_Dataset(train_set)) { stop("lgb.Booster.update: Only can use lgb.Dataset as training data") } @@ -340,7 +340,7 
@@ Booster <- R6::R6Class( # Evaluate data on metrics eval = function(data, name, feval = NULL) { - if (!lgb.is.Dataset(data)) { + if (!.is_Dataset(data)) { stop("lgb.Booster.eval: Can only use lgb.Dataset to eval") } @@ -508,17 +508,17 @@ Booster <- R6::R6Class( # NOTE: doing this here instead of in Predictor$predict() to keep # Predictor$predict() as fast as possible if (length(params) > 0L) { - params <- lgb.check.wrapper_param( + params <- .check_wrapper_param( main_param_name = "predict_raw_score" , params = params , alternative_kwarg_value = rawscore ) - params <- lgb.check.wrapper_param( + params <- .check_wrapper_param( main_param_name = "predict_leaf_index" , params = params , alternative_kwarg_value = predleaf ) - params <- lgb.check.wrapper_param( + params <- .check_wrapper_param( main_param_name = "predict_contrib" , params = params , alternative_kwarg_value = predcontrib @@ -586,7 +586,7 @@ Booster <- R6::R6Class( , predcontrib , start_iteration , num_iteration - , lgb.params2str(params = params) + , .params2str(params = params) ) private$fast_predict_config <- list( @@ -622,7 +622,7 @@ Booster <- R6::R6Class( }, check_null_handle = function() { - return(lgb.is.null.handle(private$handle)) + return(.is_null_handle(private$handle)) }, restore_handle = function() { @@ -959,7 +959,7 @@ predict.lgb.Booster <- function(object, params = list(), ...) { - if (!lgb.is.Booster(x = object)) { + if (!.is_Booster(x = object)) { stop("predict.lgb.Booster: object should be an ", sQuote("lgb.Booster")) } @@ -1114,7 +1114,7 @@ lgb.configure_fast_predict <- function(model, num_iteration = NULL, type = "response", params = list()) { - if (!lgb.is.Booster(x = model)) { + if (!.is_Booster(x = model)) { stop("lgb.configure_fast_predict: model should be an ", sQuote("lgb.Booster")) } if (type == "class") { @@ -1160,7 +1160,7 @@ lgb.configure_fast_predict <- function(model, print.lgb.Booster <- function(x, ...) 
{ # nolint start handle <- x$.__enclos_env__$private$handle - handle_is_null <- lgb.is.null.handle(handle) + handle_is_null <- .is_null_handle(handle) if (!handle_is_null) { ntrees <- x$current_iter() @@ -1316,7 +1316,7 @@ lgb.load <- function(filename = NULL, model_str = NULL) { #' @export lgb.save <- function(booster, filename, num_iteration = NULL) { - if (!lgb.is.Booster(x = booster)) { + if (!.is_Booster(x = booster)) { stop("lgb.save: booster should be an ", sQuote("lgb.Booster")) } @@ -1372,7 +1372,7 @@ lgb.save <- function(booster, filename, num_iteration = NULL) { #' @export lgb.dump <- function(booster, num_iteration = NULL) { - if (!lgb.is.Booster(x = booster)) { + if (!.is_Booster(x = booster)) { stop("lgb.dump: booster should be an ", sQuote("lgb.Booster")) } @@ -1430,7 +1430,7 @@ lgb.dump <- function(booster, num_iteration = NULL) { #' @export lgb.get.eval.result <- function(booster, data_name, eval_name, iters = NULL, is_err = FALSE) { - if (!lgb.is.Booster(x = booster)) { + if (!.is_Booster(x = booster)) { stop("lgb.get.eval.result: Can only use ", sQuote("lgb.Booster"), " to get eval result") } diff --git a/R-package/R/lgb.DataProcessor.R b/R-package/R/lgb.DataProcessor.R index fc7061945b62..c35ce4f93bd3 100644 --- a/R-package/R/lgb.DataProcessor.R +++ b/R-package/R/lgb.DataProcessor.R @@ -39,7 +39,7 @@ DataProcessor <- R6::R6Class( ) } data_num_class <- length(self$factor_levels) - params <- lgb.check.wrapper_param( + params <- .check_wrapper_param( main_param_name = "num_class" , params = params , alternative_kwarg_value = data_num_class diff --git a/R-package/R/lgb.Dataset.R b/R-package/R/lgb.Dataset.R index e2892ea4bae0..ddc338d2cae3 100644 --- a/R-package/R/lgb.Dataset.R +++ b/R-package/R/lgb.Dataset.R @@ -55,10 +55,10 @@ Dataset <- R6::R6Class( init_score = NULL) { # validate inputs early to avoid unnecessary computation - if (!(is.null(reference) || lgb.is.Dataset(reference))) { + if (!(is.null(reference) || .is_Dataset(reference))) { stop("lgb.Dataset: If provided, reference must be a ", sQuote("lgb.Dataset")) } - if (!(is.null(predictor) || lgb.is.Predictor(predictor))) { + if (!(is.null(predictor) || .is_Predictor(predictor))) { stop("lgb.Dataset: If provided, predictor must be a ", sQuote("lgb.Predictor")) } @@ -135,7 +135,7 @@ Dataset <- R6::R6Class( construct = function() { # Check for handle null - if (!lgb.is.null.handle(x = private$handle)) { + if (!.is_null_handle(x = private$handle)) { return(invisible(self)) } @@ -191,7 +191,7 @@ Dataset <- R6::R6Class( } # Generate parameter str - params_str <- lgb.params2str(params = private$params) + params_str <- .params2str(params = private$params) # Get handle of reference dataset ref_handle <- NULL @@ -277,7 +277,7 @@ Dataset <- R6::R6Class( ) } - if (lgb.is.null.handle(x = handle)) { + if (.is_null_handle(x = handle)) { stop("lgb.Dataset.construct: cannot create Dataset handle") } # Setup class and private type @@ -345,7 +345,7 @@ Dataset <- R6::R6Class( dim = function() { # Check for handle - if (!lgb.is.null.handle(x = private$handle)) { + if (!.is_null_handle(x = private$handle)) { num_row <- 0L num_col <- 0L @@ -385,7 +385,7 @@ Dataset <- R6::R6Class( # Get number of bins for feature get_feature_num_bin = function(feature) { - if (lgb.is.null.handle(x = private$handle)) { + if (.is_null_handle(x = private$handle)) { stop("Cannot get number of bins in feature before constructing Dataset.") } if (is.character(feature)) { @@ -409,7 +409,7 @@ Dataset <- R6::R6Class( get_colnames = function() { # Check for handle 
- if (!lgb.is.null.handle(x = private$handle)) { + if (!.is_null_handle(x = private$handle)) { private$colnames <- .Call( LGBM_DatasetGetFeatureNames_R , private$handle @@ -449,7 +449,7 @@ Dataset <- R6::R6Class( # Write column names private$colnames <- colnames - if (!lgb.is.null.handle(x = private$handle)) { + if (!.is_null_handle(x = private$handle)) { # Merge names with tab separation merged_name <- paste0(as.list(private$colnames), collapse = "\t") @@ -478,7 +478,7 @@ Dataset <- R6::R6Class( # Check for info name and handle if (is.null(private$info[[field_name]])) { - if (lgb.is.null.handle(x = private$handle)) { + if (.is_null_handle(x = private$handle)) { stop("Cannot perform Dataset$get_field() before constructing Dataset.") } @@ -536,7 +536,7 @@ Dataset <- R6::R6Class( # Store information privately private$info[[field_name]] <- data - if (!lgb.is.null.handle(x = private$handle) && !is.null(data)) { + if (!.is_null_handle(x = private$handle) && !is.null(data)) { if (length(data) > 0L) { @@ -583,14 +583,14 @@ Dataset <- R6::R6Class( return(invisible(self)) } new_params <- utils::modifyList(private$params, params) - if (lgb.is.null.handle(x = private$handle)) { + if (.is_null_handle(x = private$handle)) { private$params <- new_params } else { tryCatch({ .Call( LGBM_DatasetUpdateParamChecking_R - , lgb.params2str(params = private$params) - , lgb.params2str(params = new_params) + , .params2str(params = private$params) + , .params2str(params = new_params) ) private$params <- new_params }, error = function(e) { @@ -663,7 +663,7 @@ Dataset <- R6::R6Class( please set ", sQuote("free_raw_data = FALSE"), " when you construct lgb.Dataset") } - if (!lgb.is.Dataset(reference)) { + if (!.is_Dataset(reference)) { stop("set_reference: Can only use lgb.Dataset as a reference") } @@ -711,7 +711,7 @@ Dataset <- R6::R6Class( get_handle = function() { # Get handle and construct if needed - if (lgb.is.null.handle(x = private$handle)) { + if (.is_null_handle(x = private$handle)) { self$construct() } return(private$handle) @@ -734,7 +734,7 @@ Dataset <- R6::R6Class( if (!is.null(predictor)) { # Predictor is unknown - if (!lgb.is.Predictor(predictor)) { + if (!.is_Predictor(predictor)) { stop("set_predictor: Can only use lgb.Predictor as predictor") } @@ -888,7 +888,7 @@ lgb.Dataset.create.valid <- function(dataset, init_score = NULL, params = list()) { - if (!lgb.is.Dataset(x = dataset)) { + if (!.is_Dataset(x = dataset)) { stop("lgb.Dataset.create.valid: input data should be an lgb.Dataset object") } @@ -922,7 +922,7 @@ lgb.Dataset.create.valid <- function(dataset, #' @export lgb.Dataset.construct <- function(dataset) { - if (!lgb.is.Dataset(x = dataset)) { + if (!.is_Dataset(x = dataset)) { stop("lgb.Dataset.construct: input data should be an lgb.Dataset object") } @@ -954,7 +954,7 @@ lgb.Dataset.construct <- function(dataset) { #' @export dim.lgb.Dataset <- function(x) { - if (!lgb.is.Dataset(x = x)) { + if (!.is_Dataset(x = x)) { stop("dim.lgb.Dataset: input data should be an lgb.Dataset object") } @@ -989,7 +989,7 @@ dim.lgb.Dataset <- function(x) { #' @export dimnames.lgb.Dataset <- function(x) { - if (!lgb.is.Dataset(x = x)) { + if (!.is_Dataset(x = x)) { stop("dimnames.lgb.Dataset: input data should be an lgb.Dataset object") } @@ -1062,7 +1062,7 @@ slice <- function(dataset, idxset) { #' @export slice.lgb.Dataset <- function(dataset, idxset) { - if (!lgb.is.Dataset(x = dataset)) { + if (!.is_Dataset(x = dataset)) { stop("slice.lgb.Dataset: input dataset should be an lgb.Dataset object") } @@ 
-1110,7 +1110,7 @@ get_field <- function(dataset, field_name) { get_field.lgb.Dataset <- function(dataset, field_name) { # Check if dataset is not a dataset - if (!lgb.is.Dataset(x = dataset)) { + if (!.is_Dataset(x = dataset)) { stop("get_field.lgb.Dataset(): input dataset should be an lgb.Dataset object") } @@ -1158,7 +1158,7 @@ set_field <- function(dataset, field_name, data) { #' @export set_field.lgb.Dataset <- function(dataset, field_name, data) { - if (!lgb.is.Dataset(x = dataset)) { + if (!.is_Dataset(x = dataset)) { stop("set_field.lgb.Dataset: input dataset should be an lgb.Dataset object") } @@ -1189,7 +1189,7 @@ set_field.lgb.Dataset <- function(dataset, field_name, data) { #' @export lgb.Dataset.set.categorical <- function(dataset, categorical_feature) { - if (!lgb.is.Dataset(x = dataset)) { + if (!.is_Dataset(x = dataset)) { stop("lgb.Dataset.set.categorical: input dataset should be an lgb.Dataset object") } @@ -1222,7 +1222,7 @@ lgb.Dataset.set.categorical <- function(dataset, categorical_feature) { #' @export lgb.Dataset.set.reference <- function(dataset, reference) { - if (!lgb.is.Dataset(x = dataset)) { + if (!.is_Dataset(x = dataset)) { stop("lgb.Dataset.set.reference: input dataset should be an lgb.Dataset object") } @@ -1248,7 +1248,7 @@ lgb.Dataset.set.reference <- function(dataset, reference) { #' @export lgb.Dataset.save <- function(dataset, fname) { - if (!lgb.is.Dataset(x = dataset)) { + if (!.is_Dataset(x = dataset)) { stop("lgb.Dataset.save: input dataset should be an lgb.Dataset object") } diff --git a/R-package/R/lgb.Predictor.R b/R-package/R/lgb.Predictor.R index 0b7b39e2d8c2..3a411efd75ba 100644 --- a/R-package/R/lgb.Predictor.R +++ b/R-package/R/lgb.Predictor.R @@ -28,7 +28,7 @@ Predictor <- R6::R6Class( # Initialize will create a starter model initialize = function(modelfile, params = list(), fast_predict_config = list()) { - private$params <- lgb.params2str(params = params) + private$params <- .params2str(params = params) handle <- NULL if (is.character(modelfile)) { @@ -46,7 +46,7 @@ Predictor <- R6::R6Class( handle <- modelfile private$need_free_handle <- FALSE - } else if (lgb.is.Booster(modelfile)) { + } else if (.is_Booster(modelfile)) { handle <- modelfile$get_handle() private$need_free_handle <- FALSE @@ -512,7 +512,7 @@ Predictor <- R6::R6Class( return(FALSE) } - if (lgb.is.null.handle(private$fast_predict_config$handle)) { + if (.is_null_handle(private$fast_predict_config$handle)) { warning(paste0("Model had fast CSR predict configuration, but it is inactive." 
, " Try re-generating it through 'lgb.configure_fast_predict'.")) return(FALSE) @@ -527,8 +527,8 @@ Predictor <- R6::R6Class( private$fast_predict_config$rawscore == rawscore && private$fast_predict_config$predleaf == predleaf && private$fast_predict_config$predcontrib == predcontrib && - lgb.equal.or.both.null(private$fast_predict_config$start_iteration, start_iteration) && - lgb.equal.or.both.null(private$fast_predict_config$num_iteration, num_iteration) + .equal_or_both_null(private$fast_predict_config$start_iteration, start_iteration) && + .equal_or_both_null(private$fast_predict_config$num_iteration, num_iteration) ) } ) diff --git a/R-package/R/lgb.cv.R b/R-package/R/lgb.cv.R index f81026fe673f..11768c5bfa0b 100644 --- a/R-package/R/lgb.cv.R +++ b/R-package/R/lgb.cv.R @@ -99,7 +99,7 @@ lgb.cv <- function(params = list() } # If 'data' is not an lgb.Dataset, try to construct one using 'label' - if (!lgb.is.Dataset(x = data)) { + if (!.is_Dataset(x = data)) { if (is.null(label)) { stop("'label' must be provided for lgb.cv if 'data' is not an 'lgb.Dataset'") } @@ -110,27 +110,27 @@ lgb.cv <- function(params = list() # in `params`. # this ensures that the model stored with Booster$save() correctly represents # what was passed in - params <- lgb.check.wrapper_param( + params <- .check_wrapper_param( main_param_name = "verbosity" , params = params , alternative_kwarg_value = verbose ) - params <- lgb.check.wrapper_param( + params <- .check_wrapper_param( main_param_name = "num_iterations" , params = params , alternative_kwarg_value = nrounds ) - params <- lgb.check.wrapper_param( + params <- .check_wrapper_param( main_param_name = "metric" , params = params , alternative_kwarg_value = NULL ) - params <- lgb.check.wrapper_param( + params <- .check_wrapper_param( main_param_name = "objective" , params = params , alternative_kwarg_value = obj ) - params <- lgb.check.wrapper_param( + params <- .check_wrapper_param( main_param_name = "early_stopping_round" , params = params , alternative_kwarg_value = early_stopping_rounds @@ -148,7 +148,7 @@ lgb.cv <- function(params = list() # (for backwards compatibility). If it is a list of functions, store # all of them. 
This makes it possible to pass any mix of strings like "auc" # and custom functions to eval - params <- lgb.check.eval(params = params, eval = eval) + params <- .check_eval(params = params, eval = eval) eval_functions <- list(NULL) if (is.function(eval)) { eval_functions <- list(eval) @@ -166,7 +166,7 @@ lgb.cv <- function(params = list() # Check for boosting from a trained model if (is.character(init_model)) { predictor <- Predictor$new(modelfile = init_model) - } else if (lgb.is.Booster(x = init_model)) { + } else if (.is_Booster(x = init_model)) { predictor <- init_model$to_predictor() } @@ -193,7 +193,7 @@ lgb.cv <- function(params = list() } else if (!is.null(data$get_colnames())) { cnames <- data$get_colnames() } - params[["interaction_constraints"]] <- lgb.check_interaction_constraints( + params[["interaction_constraints"]] <- .check_interaction_constraints( interaction_constraints = interaction_constraints , column_names = cnames ) @@ -232,7 +232,7 @@ lgb.cv <- function(params = list() } # Create folds - folds <- generate.cv.folds( + folds <- .generate_cv_folds( nfold = nfold , nrows = nrow(data) , stratified = stratified @@ -245,12 +245,12 @@ lgb.cv <- function(params = list() # Add printing log callback if (params[["verbosity"]] > 0L && eval_freq > 0L) { - callbacks <- add.cb(cb_list = callbacks, cb = cb_print_evaluation(period = eval_freq)) + callbacks <- .add_cb(cb_list = callbacks, cb = cb_print_evaluation(period = eval_freq)) } # Add evaluation log callback if (record) { - callbacks <- add.cb(cb_list = callbacks, cb = cb_record_evaluation()) + callbacks <- .add_cb(cb_list = callbacks, cb = cb_record_evaluation()) } # Did user pass parameters that indicate they want to use early stopping? @@ -282,7 +282,7 @@ lgb.cv <- function(params = list() # If user supplied early_stopping_rounds, add the early stopping callback if (using_early_stopping) { - callbacks <- add.cb( + callbacks <- .add_cb( cb_list = callbacks , cb = cb_early_stop( stopping_rounds = early_stopping_rounds @@ -292,7 +292,7 @@ lgb.cv <- function(params = list() ) } - cb <- categorize.callbacks(cb_list = callbacks) + cb <- .categorize_callbacks(cb_list = callbacks) # Construct booster for each fold. The data.table() code below is used to # guarantee that indices are sorted while keeping init_score and weight together @@ -387,7 +387,7 @@ lgb.cv <- function(params = list() }) # Prepare collection of evaluation results - merged_msg <- lgb.merge.cv.result( + merged_msg <- .merge_cv_result( msg = msg , showsd = showsd ) @@ -463,7 +463,7 @@ lgb.cv <- function(params = list() } # Generates random (stratified if needed) CV folds -generate.cv.folds <- function(nfold, nrows, stratified, label, group, params) { +.generate_cv_folds <- function(nfold, nrows, stratified, label, group, params) { # Check for group existence if (is.null(group)) { @@ -476,7 +476,7 @@ generate.cv.folds <- function(nfold, nrows, stratified, label, group, params) { y <- label[rnd_idx] y <- as.factor(y) - folds <- lgb.stratified.folds(y = y, k = nfold) + folds <- .stratified_folds(y = y, k = nfold) } else { @@ -528,7 +528,7 @@ generate.cv.folds <- function(nfold, nrows, stratified, label, group, params) { # It was borrowed from caret::createFolds and simplified # by always returning an unnamed list of fold indices. #' @importFrom stats quantile -lgb.stratified.folds <- function(y, k) { +.stratified_folds <- function(y, k) { # Group the numeric data based on their magnitudes # and sample within those groups. 
@@ -594,7 +594,7 @@ lgb.stratified.folds <- function(y, k) { return(out) } -lgb.merge.cv.result <- function(msg, showsd) { +.merge_cv_result <- function(msg, showsd) { if (length(msg) == 0L) { stop("lgb.cv: size of cv result error") diff --git a/R-package/R/lgb.drop_serialized.R b/R-package/R/lgb.drop_serialized.R index bcc2480e8ccc..e53f2cafac11 100644 --- a/R-package/R/lgb.drop_serialized.R +++ b/R-package/R/lgb.drop_serialized.R @@ -13,7 +13,7 @@ #' @seealso \link{lgb.restore_handle}, \link{lgb.make_serializable}. #' @export lgb.drop_serialized <- function(model) { - if (!lgb.is.Booster(x = model)) { + if (!.is_Booster(x = model)) { stop("lgb.drop_serialized: model should be an ", sQuote("lgb.Booster")) } model$drop_raw() diff --git a/R-package/R/lgb.importance.R b/R-package/R/lgb.importance.R index 5a58770553f9..27efb17392df 100644 --- a/R-package/R/lgb.importance.R +++ b/R-package/R/lgb.importance.R @@ -39,7 +39,7 @@ #' @export lgb.importance <- function(model, percentage = TRUE) { - if (!lgb.is.Booster(x = model)) { + if (!.is_Booster(x = model)) { stop("'model' has to be an object of class lgb.Booster") } diff --git a/R-package/R/lgb.interprete.R b/R-package/R/lgb.interprete.R index 7de772664d8b..976315262792 100644 --- a/R-package/R/lgb.interprete.R +++ b/R-package/R/lgb.interprete.R @@ -86,7 +86,7 @@ lgb.interprete <- function(model, ) for (i in seq_along(idxset)) { - tree_interpretation_dt_list[[i]] <- single.row.interprete( + tree_interpretation_dt_list[[i]] <- .single_row_interprete( tree_dt = tree_dt , num_class = num_class , tree_index_mat = tree_index_mat_list[[i]] @@ -151,7 +151,7 @@ single.tree.interprete <- function(tree_dt, } #' @importFrom data.table := rbindlist setorder -multiple.tree.interprete <- function(tree_dt, +.multiple_tree_interprete <- function(tree_dt, tree_index, leaf_index) { @@ -186,7 +186,7 @@ multiple.tree.interprete <- function(tree_dt, } #' @importFrom data.table set setnames -single.row.interprete <- function(tree_dt, num_class, tree_index_mat, leaf_index_mat) { +.single_row_interprete <- function(tree_dt, num_class, tree_index_mat, leaf_index_mat) { # Prepare vector list tree_interpretation <- vector(mode = "list", length = num_class) @@ -194,7 +194,7 @@ single.row.interprete <- function(tree_dt, num_class, tree_index_mat, leaf_index # Loop throughout each class for (i in seq_len(num_class)) { - next_interp_dt <- multiple.tree.interprete( + next_interp_dt <- .multiple_tree_interprete( tree_dt = tree_dt , tree_index = tree_index_mat[, i] , leaf_index = leaf_index_mat[, i] diff --git a/R-package/R/lgb.make_serializable.R b/R-package/R/lgb.make_serializable.R index 58bdd194df4d..5a639aacb2b5 100644 --- a/R-package/R/lgb.make_serializable.R +++ b/R-package/R/lgb.make_serializable.R @@ -13,7 +13,7 @@ #' @seealso \link{lgb.restore_handle}, \link{lgb.drop_serialized}. 
#' @export lgb.make_serializable <- function(model) { - if (!lgb.is.Booster(x = model)) { + if (!.is_Booster(x = model)) { stop("lgb.make_serializable: model should be an ", sQuote("lgb.Booster")) } model$save_raw() diff --git a/R-package/R/lgb.model.dt.tree.R b/R-package/R/lgb.model.dt.tree.R index 8b0d8d81e2e8..5d994accfa7f 100644 --- a/R-package/R/lgb.model.dt.tree.R +++ b/R-package/R/lgb.model.dt.tree.R @@ -62,7 +62,10 @@ lgb.model.dt.tree <- function(model, num_iteration = NULL) { ) # Parse tree model - tree_list <- lapply(parsed_json_model$tree_info, single.tree.parse) + tree_list <- lapply( + X = parsed_json_model$tree_info + , FUN = .single_tree_parse + ) # Combine into single data.table tree_dt <- data.table::rbindlist(l = tree_list, use.names = TRUE) @@ -84,7 +87,7 @@ lgb.model.dt.tree <- function(model, num_iteration = NULL) { #' @importFrom data.table := data.table rbindlist -single.tree.parse <- function(lgb_tree) { +.single_tree_parse <- function(lgb_tree) { # Traverse tree function pre_order_traversal <- function(env = NULL, tree_node_leaf, current_depth = 0L, parent_index = NA_integer_) { diff --git a/R-package/R/lgb.plot.interpretation.R b/R-package/R/lgb.plot.interpretation.R index a88f14bf83f0..8b95371eb3c2 100644 --- a/R-package/R/lgb.plot.interpretation.R +++ b/R-package/R/lgb.plot.interpretation.R @@ -89,7 +89,7 @@ lgb.plot.interpretation <- function(tree_interpretation_dt, if (num_class == 1L) { # Only one class, plot straight away - multiple.tree.plot.interpretation( + .multiple_tree_plot_interpretation( tree_interpretation = tree_interpretation_dt , top_n = top_n , title = NULL @@ -118,7 +118,7 @@ lgb.plot.interpretation <- function(tree_interpretation_dt, , old = names(plot_dt) , new = c("Feature", "Contribution") ) - multiple.tree.plot.interpretation( + .multiple_tree_plot_interpretation( tree_interpretation = plot_dt , top_n = top_n , title = paste("Class", i - 1L) @@ -131,7 +131,7 @@ lgb.plot.interpretation <- function(tree_interpretation_dt, } #' @importFrom graphics barplot -multiple.tree.plot.interpretation <- function(tree_interpretation, +.multiple_tree_plot_interpretation <- function(tree_interpretation, top_n, title, cex) { diff --git a/R-package/R/lgb.restore_handle.R b/R-package/R/lgb.restore_handle.R index 4de93d46c96a..0ed25ef26f3d 100644 --- a/R-package/R/lgb.restore_handle.R +++ b/R-package/R/lgb.restore_handle.R @@ -35,7 +35,7 @@ #' model_new$check_null_handle() #' @export lgb.restore_handle <- function(model) { - if (!lgb.is.Booster(x = model)) { + if (!.is_Booster(x = model)) { stop("lgb.restore_handle: model should be an ", sQuote("lgb.Booster")) } model$restore_handle() diff --git a/R-package/R/lgb.train.R b/R-package/R/lgb.train.R index 20916c9844b5..6979558d22cd 100644 --- a/R-package/R/lgb.train.R +++ b/R-package/R/lgb.train.R @@ -63,11 +63,11 @@ lgb.train <- function(params = list(), if (nrounds <= 0L) { stop("nrounds should be greater than zero") } - if (!lgb.is.Dataset(x = data)) { + if (!.is_Dataset(x = data)) { stop("lgb.train: data must be an lgb.Dataset instance") } if (length(valids) > 0L) { - if (!identical(class(valids), "list") || !all(vapply(valids, lgb.is.Dataset, logical(1L)))) { + if (!identical(class(valids), "list") || !all(vapply(valids, .is_Dataset, logical(1L)))) { stop("lgb.train: valids must be a list of lgb.Dataset elements") } evnames <- names(valids) @@ -80,27 +80,27 @@ lgb.train <- function(params = list(), # in `params`. 
# this ensures that the model stored with Booster$save() correctly represents # what was passed in - params <- lgb.check.wrapper_param( + params <- .check_wrapper_param( main_param_name = "verbosity" , params = params , alternative_kwarg_value = verbose ) - params <- lgb.check.wrapper_param( + params <- .check_wrapper_param( main_param_name = "num_iterations" , params = params , alternative_kwarg_value = nrounds ) - params <- lgb.check.wrapper_param( + params <- .check_wrapper_param( main_param_name = "metric" , params = params , alternative_kwarg_value = NULL ) - params <- lgb.check.wrapper_param( + params <- .check_wrapper_param( main_param_name = "objective" , params = params , alternative_kwarg_value = obj ) - params <- lgb.check.wrapper_param( + params <- .check_wrapper_param( main_param_name = "early_stopping_round" , params = params , alternative_kwarg_value = early_stopping_rounds @@ -118,7 +118,7 @@ lgb.train <- function(params = list(), # (for backwards compatibility). If it is a list of functions, store # all of them. This makes it possible to pass any mix of strings like "auc" # and custom functions to eval - params <- lgb.check.eval(params = params, eval = eval) + params <- .check_eval(params = params, eval = eval) eval_functions <- list(NULL) if (is.function(eval)) { eval_functions <- list(eval) @@ -136,7 +136,7 @@ lgb.train <- function(params = list(), # Check for boosting from a trained model if (is.character(init_model)) { predictor <- Predictor$new(modelfile = init_model) - } else if (lgb.is.Booster(x = init_model)) { + } else if (.is_Booster(x = init_model)) { predictor <- init_model$to_predictor() } @@ -166,7 +166,7 @@ lgb.train <- function(params = list(), } else if (!is.null(data$get_colnames())) { cnames <- data$get_colnames() } - params[["interaction_constraints"]] <- lgb.check_interaction_constraints( + params[["interaction_constraints"]] <- .check_interaction_constraints( interaction_constraints = interaction_constraints , column_names = cnames ) @@ -212,12 +212,18 @@ lgb.train <- function(params = list(), # Add printing log callback if (params[["verbosity"]] > 0L && eval_freq > 0L) { - callbacks <- add.cb(cb_list = callbacks, cb = cb_print_evaluation(period = eval_freq)) + callbacks <- .add_cb( + cb_list = callbacks + , cb = cb_print_evaluation(period = eval_freq) + ) } # Add evaluation log callback if (record && length(valids) > 0L) { - callbacks <- add.cb(cb_list = callbacks, cb = cb_record_evaluation()) + callbacks <- .add_cb( + cb_list = callbacks + , cb = cb_record_evaluation() + ) } # Did user pass parameters that indicate they want to use early stopping? 
@@ -249,7 +255,7 @@ lgb.train <- function(params = list(), # If user supplied early_stopping_rounds, add the early stopping callback if (using_early_stopping) { - callbacks <- add.cb( + callbacks <- .add_cb( cb_list = callbacks , cb = cb_early_stop( stopping_rounds = early_stopping_rounds @@ -259,7 +265,7 @@ lgb.train <- function(params = list(), ) } - cb <- categorize.callbacks(cb_list = callbacks) + cb <- .categorize_callbacks(cb_list = callbacks) # Construct booster with datasets booster <- Booster$new(params = params, train_set = data) diff --git a/R-package/R/lightgbm.R b/R-package/R/lightgbm.R index 711b3ef0dc38..e5df7a93fc97 100644 --- a/R-package/R/lightgbm.R +++ b/R-package/R/lightgbm.R @@ -184,21 +184,21 @@ lightgbm <- function(data, } if (is.null(num_threads)) { - num_threads <- lgb.get.default.num.threads() + num_threads <- .get_default_num_threads() } - params <- lgb.check.wrapper_param( + params <- .check_wrapper_param( main_param_name = "num_threads" , params = params , alternative_kwarg_value = num_threads ) - params <- lgb.check.wrapper_param( + params <- .check_wrapper_param( main_param_name = "verbosity" , params = params , alternative_kwarg_value = verbose ) # Process factors as labels and auto-determine objective - if (!lgb.is.Dataset(data)) { + if (!.is_Dataset(data)) { data_processor <- DataProcessor$new() temp <- data_processor$process_label( label = label @@ -220,7 +220,7 @@ lightgbm <- function(data, dtrain <- data # Check whether data is lgb.Dataset, if not then create lgb.Dataset manually - if (!lgb.is.Dataset(x = dtrain)) { + if (!.is_Dataset(x = dtrain)) { dtrain <- lgb.Dataset(data = data, label = label, weight = weights, init_score = init_score) } diff --git a/R-package/R/saveRDS.lgb.Booster.R b/R-package/R/saveRDS.lgb.Booster.R index 5d3af097301f..d75056e69734 100644 --- a/R-package/R/saveRDS.lgb.Booster.R +++ b/R-package/R/saveRDS.lgb.Booster.R @@ -57,7 +57,7 @@ saveRDS.lgb.Booster <- function(object, warning("'saveRDS.lgb.Booster' is deprecated and will be removed in a future release. 
Use saveRDS() instead.") - if (!lgb.is.Booster(x = object)) { + if (!.is_Booster(x = object)) { stop("saveRDS.lgb.Booster: object should be an ", sQuote("lgb.Booster")) } diff --git a/R-package/R/utils.R b/R-package/R/utils.R index c9ba780316df..1ac6f197ca77 100644 --- a/R-package/R/utils.R +++ b/R-package/R/utils.R @@ -1,16 +1,16 @@ -lgb.is.Booster <- function(x) { +.is_Booster <- function(x) { return(all(c("R6", "lgb.Booster") %in% class(x))) # nolint: class_equals } -lgb.is.Dataset <- function(x) { +.is_Dataset <- function(x) { return(all(c("R6", "lgb.Dataset") %in% class(x))) # nolint: class_equals } -lgb.is.Predictor <- function(x) { +.is_Predictor <- function(x) { return(all(c("R6", "lgb.Predictor") %in% class(x))) # nolint: class_equals } -lgb.is.null.handle <- function(x) { +.is_null_handle <- function(x) { if (is.null(x)) { return(TRUE) } @@ -19,7 +19,7 @@ lgb.is.null.handle <- function(x) { ) } -lgb.params2str <- function(params) { +.params2str <- function(params) { if (!identical(class(params), "list")) { stop("params must be a list") @@ -59,7 +59,7 @@ lgb.params2str <- function(params) { } -lgb.check_interaction_constraints <- function(interaction_constraints, column_names) { +.check_interaction_constraints <- function(interaction_constraints, column_names) { # Convert interaction constraints to feature numbers string_constraints <- list() @@ -129,7 +129,7 @@ lgb.check_interaction_constraints <- function(interaction_constraints, column_na # This has to account for the fact that `eval` could be a character vector, # a function, a list of functions, or a list with a mix of strings and # functions -lgb.check.eval <- function(params, eval) { +.check_eval <- function(params, eval) { if (is.null(params$metric)) { params$metric <- list() @@ -194,7 +194,7 @@ lgb.check.eval <- function(params, eval) { # [return] # params with num_iterations set to the chosen value, and other aliases # of num_iterations removed -lgb.check.wrapper_param <- function(main_param_name, params, alternative_kwarg_value) { +.check_wrapper_param <- function(main_param_name, params, alternative_kwarg_value) { aliases <- .PARAMETER_ALIASES()[[main_param_name]] aliases_provided <- aliases[aliases %in% names(params)] @@ -225,7 +225,7 @@ lgb.check.wrapper_param <- function(main_param_name, params, alternative_kwarg_v } #' @importFrom parallel detectCores -lgb.get.default.num.threads <- function() { +.get_default_num_threads <- function() { if (requireNamespace("RhpcBLASctl", quietly = TRUE)) { # nolint: undesirable_function return(RhpcBLASctl::get_num_cores()) } else { @@ -247,7 +247,7 @@ lgb.get.default.num.threads <- function() { } } -lgb.equal.or.both.null <- function(a, b) { +.equal_or_both_null <- function(a, b) { if (is.null(a)) { if (!is.null(b)) { return(FALSE) diff --git a/R-package/tests/testthat/test_Predictor.R b/R-package/tests/testthat/test_Predictor.R index 90be1d08cf67..192171c915bf 100644 --- a/R-package/tests/testthat/test_Predictor.R +++ b/R-package/tests/testthat/test_Predictor.R @@ -17,16 +17,16 @@ test_that("Predictor$finalize() should not fail", { bst$save_model(filename = model_file) predictor <- Predictor$new(modelfile = model_file) - expect_true(lgb.is.Predictor(predictor)) + expect_true(.is_Predictor(predictor)) - expect_false(lgb.is.null.handle(predictor$.__enclos_env__$private$handle)) + expect_false(.is_null_handle(predictor$.__enclos_env__$private$handle)) predictor$finalize() - expect_true(lgb.is.null.handle(predictor$.__enclos_env__$private$handle)) + 
expect_true(.is_null_handle(predictor$.__enclos_env__$private$handle)) # calling finalize() a second time shouldn't cause any issues predictor$finalize() - expect_true(lgb.is.null.handle(predictor$.__enclos_env__$private$handle)) + expect_true(.is_null_handle(predictor$.__enclos_env__$private$handle)) }) test_that("predictions do not fail for integer input", { @@ -79,7 +79,7 @@ test_that("start_iteration works correctly", { , valids = list("test" = dtest) , early_stopping_rounds = 2L ) - expect_true(lgb.is.Booster(bst)) + expect_true(.is_Booster(bst)) pred1 <- predict(bst, newdata = test$data, type = "raw") pred_contrib1 <- predict(bst, test$data, type = "contrib") pred2 <- rep(0.0, length(pred1)) diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R index 9b84017476a7..75abd26dd152 100644 --- a/R-package/tests/testthat/test_basic.R +++ b/R-package/tests/testthat/test_basic.R @@ -1094,7 +1094,7 @@ test_that("lgb.train() works as expected with sparse features", { , nrounds = nrounds ) - expect_true(lgb.is.Booster(bst)) + expect_true(.is_Booster(bst)) expect_equal(bst$current_iter(), nrounds) parsed_model <- jsonlite::fromJSON(bst$dump_model()) expect_equal(parsed_model$objective, "binary sigmoid:1") @@ -1816,7 +1816,7 @@ test_that("lgb.train() supports non-ASCII feature names", { ) , colnames = feature_names ) - expect_true(lgb.is.Booster(bst)) + expect_true(.is_Booster(bst)) dumped_model <- jsonlite::fromJSON(bst$dump_model()) # UTF-8 strings are not well-supported on Windows @@ -2522,7 +2522,7 @@ test_that("lgb.train() fit on linearly-relatead data improves when using linear , params = params , valids = list("train" = dtrain) ) - expect_true(lgb.is.Booster(bst)) + expect_true(.is_Booster(bst)) dtrain <- .new_dataset() bst_linear <- lgb.train( @@ -2531,7 +2531,7 @@ test_that("lgb.train() fit on linearly-relatead data improves when using linear , params = utils::modifyList(params, list(linear_tree = TRUE)) , valids = list("train" = dtrain) ) - expect_true(lgb.is.Booster(bst_linear)) + expect_true(.is_Booster(bst_linear)) bst_last_mse <- bst$record_evals[["train"]][["l2"]][["eval"]][[10L]] bst_lin_last_mse <- bst_linear$record_evals[["train"]][["l2"]][["eval"]][[10L]] @@ -2599,7 +2599,7 @@ test_that("lgb.train() works with linear learners even if Dataset has missing va , params = params , valids = list("train" = dtrain) ) - expect_true(lgb.is.Booster(bst)) + expect_true(.is_Booster(bst)) dtrain <- .new_dataset() bst_linear <- lgb.train( @@ -2608,7 +2608,7 @@ test_that("lgb.train() works with linear learners even if Dataset has missing va , params = utils::modifyList(params, list(linear_tree = TRUE)) , valids = list("train" = dtrain) ) - expect_true(lgb.is.Booster(bst_linear)) + expect_true(.is_Booster(bst_linear)) bst_last_mse <- bst$record_evals[["train"]][["l2"]][["eval"]][[10L]] bst_lin_last_mse <- bst_linear$record_evals[["train"]][["l2"]][["eval"]][[10L]] @@ -2649,7 +2649,7 @@ test_that("lgb.train() works with linear learners, bagging, and a Dataset that h , params = params , valids = list("train" = dtrain) ) - expect_true(lgb.is.Booster(bst)) + expect_true(.is_Booster(bst)) dtrain <- .new_dataset() bst_linear <- lgb.train( @@ -2658,7 +2658,7 @@ test_that("lgb.train() works with linear learners, bagging, and a Dataset that h , params = utils::modifyList(params, list(linear_tree = TRUE)) , valids = list("train" = dtrain) ) - expect_true(lgb.is.Booster(bst_linear)) + expect_true(.is_Booster(bst_linear)) bst_last_mse <- 
bst$record_evals[["train"]][["l2"]][["eval"]][[10L]] bst_lin_last_mse <- bst_linear$record_evals[["train"]][["l2"]][["eval"]][[10L]] @@ -2699,7 +2699,7 @@ test_that("lgb.train() works with linear learners and data where a feature has o , nrounds = 10L , params = utils::modifyList(params, list(linear_tree = TRUE)) ) - expect_true(lgb.is.Booster(bst_linear)) + expect_true(.is_Booster(bst_linear)) }) test_that("lgb.train() works with linear learners when Dataset has categorical features", { @@ -2732,7 +2732,7 @@ test_that("lgb.train() works with linear learners when Dataset has categorical f , params = params , valids = list("train" = dtrain) ) - expect_true(lgb.is.Booster(bst)) + expect_true(.is_Booster(bst)) dtrain <- .new_dataset() bst_linear <- lgb.train( @@ -2741,7 +2741,7 @@ test_that("lgb.train() works with linear learners when Dataset has categorical f , params = utils::modifyList(params, list(linear_tree = TRUE)) , valids = list("train" = dtrain) ) - expect_true(lgb.is.Booster(bst_linear)) + expect_true(.is_Booster(bst_linear)) bst_last_mse <- bst$record_evals[["train"]][["l2"]][["eval"]][[10L]] bst_lin_last_mse <- bst_linear$record_evals[["train"]][["l2"]][["eval"]][[10L]] diff --git a/R-package/tests/testthat/test_dataset.R b/R-package/tests/testthat/test_dataset.R index 401d1babf5e9..a8585baa2621 100644 --- a/R-package/tests/testthat/test_dataset.R +++ b/R-package/tests/testthat/test_dataset.R @@ -206,7 +206,7 @@ test_that("lgb.Dataset: Dataset should be able to construct from matrix and retu , rawData , nrow(rawData) , ncol(rawData) - , lightgbm:::lgb.params2str(params = list()) + , lightgbm:::.params2str(params = list()) , ref_handle ) expect_true(methods::is(handle, "externalptr")) @@ -322,7 +322,7 @@ test_that("Dataset$update_parameters() does nothing for empty inputs", { res <- ds$update_params( params = list() ) - expect_true(lgb.is.Dataset(res)) + expect_true(.is_Dataset(res)) new_params <- ds$get_params() expect_identical(new_params, initial_params) @@ -343,7 +343,7 @@ test_that("Dataset$update_params() works correctly for recognized Dataset parame res <- ds$update_params( params = new_params ) - expect_true(lgb.is.Dataset(res)) + expect_true(.is_Dataset(res)) updated_params <- ds$get_params() for (param_name in names(new_params)) { @@ -356,17 +356,17 @@ test_that("Dataset$finalize() should not fail on an already-finalized Dataset", data = test_data , label = test_label ) - expect_true(lgb.is.null.handle(dtest$.__enclos_env__$private$handle)) + expect_true(.is_null_handle(dtest$.__enclos_env__$private$handle)) dtest$construct() - expect_false(lgb.is.null.handle(dtest$.__enclos_env__$private$handle)) + expect_false(.is_null_handle(dtest$.__enclos_env__$private$handle)) dtest$finalize() - expect_true(lgb.is.null.handle(dtest$.__enclos_env__$private$handle)) + expect_true(.is_null_handle(dtest$.__enclos_env__$private$handle)) # calling finalize() a second time shouldn't cause any issues dtest$finalize() - expect_true(lgb.is.null.handle(dtest$.__enclos_env__$private$handle)) + expect_true(.is_null_handle(dtest$.__enclos_env__$private$handle)) }) test_that("lgb.Dataset: should be able to run lgb.train() immediately after using lgb.Dataset() on a file", { @@ -401,7 +401,7 @@ test_that("lgb.Dataset: should be able to run lgb.train() immediately after usin , data = dtest_read_in ) - expect_true(lgb.is.Booster(x = bst)) + expect_true(.is_Booster(x = bst)) }) test_that("lgb.Dataset: should be able to run lgb.cv() immediately after using lgb.Dataset() on a file", { diff --git 
a/R-package/tests/testthat/test_learning_to_rank.R b/R-package/tests/testthat/test_learning_to_rank.R index b4ebe7bd67c3..e99aff44ceb3 100644 --- a/R-package/tests/testthat/test_learning_to_rank.R +++ b/R-package/tests/testthat/test_learning_to_rank.R @@ -25,7 +25,7 @@ test_that("learning-to-rank with lgb.train() works as expected", { , data = dtrain , nrounds = 10L ) - expect_true(lgb.is.Booster(model)) + expect_true(.is_Booster(model)) dumped_model <- jsonlite::fromJSON( model$dump_model() diff --git a/R-package/tests/testthat/test_lgb.Booster.R b/R-package/tests/testthat/test_lgb.Booster.R index 7ebb236cd923..c1fc02630c13 100644 --- a/R-package/tests/testthat/test_lgb.Booster.R +++ b/R-package/tests/testthat/test_lgb.Booster.R @@ -11,16 +11,16 @@ test_that("Booster$finalize() should not fail", { , verbose = .LGB_VERBOSITY , nrounds = 3L ) - expect_true(lgb.is.Booster(bst)) + expect_true(.is_Booster(bst)) - expect_false(lgb.is.null.handle(bst$.__enclos_env__$private$handle)) + expect_false(.is_null_handle(bst$.__enclos_env__$private$handle)) bst$finalize() - expect_true(lgb.is.null.handle(bst$.__enclos_env__$private$handle)) + expect_true(.is_null_handle(bst$.__enclos_env__$private$handle)) # calling finalize() a second time shouldn't cause any issues bst$finalize() - expect_true(lgb.is.null.handle(bst$.__enclos_env__$private$handle)) + expect_true(.is_null_handle(bst$.__enclos_env__$private$handle)) }) test_that("lgb.get.eval.result() should throw an informative error if booster is not an lgb.Booster", { @@ -188,7 +188,7 @@ test_that("Loading a Booster from a text file works", { , params = params , nrounds = 2L ) - expect_true(lgb.is.Booster(bst)) + expect_true(.is_Booster(bst)) pred <- predict(bst, test$data) model_file <- tempfile(fileext = ".model") @@ -232,7 +232,7 @@ test_that("boosters with linear models at leaves can be written to text file and , params = params , verbose = .LGB_VERBOSITY ) - expect_true(lgb.is.Booster(bst)) + expect_true(.is_Booster(bst)) # save predictions, then write the model to a file and destroy it in R preds <- predict(bst, X) @@ -269,7 +269,7 @@ test_that("Loading a Booster from a string works", { ) , nrounds = 2L ) - expect_true(lgb.is.Booster(bst)) + expect_true(.is_Booster(bst)) pred <- predict(bst, test$data) model_string <- bst$save_model_to_string() @@ -376,7 +376,7 @@ test_that("If a string and a file are both passed to lgb.load() the file is used ) , nrounds = 2L ) - expect_true(lgb.is.Booster(bst)) + expect_true(.is_Booster(bst)) pred <- predict(bst, test$data) model_file <- tempfile(fileext = ".model") @@ -411,7 +411,7 @@ test_that("Creating a Booster from a Dataset should work", { ), train_set = dtrain ) - expect_true(lgb.is.Booster(bst)) + expect_true(.is_Booster(bst)) expect_equal(bst$current_iter(), 0L) expect_true(is.na(bst$best_score)) expect_true(all(bst$predict(agaricus.train$data) == 0.5)) @@ -446,10 +446,10 @@ test_that("Creating a Booster from a Dataset with an existing predictor should w , num_threads = .LGB_MAX_THREADS ) ) - expect_true(lgb.is.Booster(bst)) + expect_true(.is_Booster(bst)) expect_equal(bst$current_iter(), nrounds) expect_equal(bst$eval_train()[[1L]][["value"]], 0.1115352) - expect_true(lgb.is.Booster(bst_from_ds)) + expect_true(.is_Booster(bst_from_ds)) expect_equal(bst_from_ds$current_iter(), nrounds) expect_equal(bst_from_ds$eval_train()[[1L]][["value"]], 5.65704892) dumped_model <- jsonlite::fromJSON(bst$dump_model()) @@ -531,7 +531,7 @@ test_that("Booster$rollback_one_iter() should work as expected", { , nrounds 
= nrounds ) expect_equal(bst$current_iter(), nrounds) - expect_true(lgb.is.Booster(bst)) + expect_true(.is_Booster(bst)) logloss <- bst$eval_train()[[1L]][["value"]] expect_equal(logloss, 0.01904786) @@ -539,7 +539,7 @@ test_that("Booster$rollback_one_iter() should work as expected", { # rollback_one_iter() should return a booster and modify the original # booster in place - expect_true(lgb.is.Booster(x)) + expect_true(.is_Booster(x)) expect_equal(bst$current_iter(), nrounds - 1L) # score should now come from the model as of 4 iterations @@ -565,7 +565,7 @@ test_that("Booster$update() passing a train_set works as expected", { ) , nrounds = nrounds ) - expect_true(lgb.is.Booster(bst)) + expect_true(.is_Booster(bst)) expect_equal(bst$current_iter(), nrounds) bst$update( train_set = Dataset$new( @@ -574,7 +574,7 @@ test_that("Booster$update() passing a train_set works as expected", { , params = list(verbose = .LGB_VERBOSITY) ) ) - expect_true(lgb.is.Booster(bst)) + expect_true(.is_Booster(bst)) expect_equal(bst$current_iter(), nrounds + 1L) # train with 3 rounds directly @@ -590,7 +590,7 @@ test_that("Booster$update() passing a train_set works as expected", { ) , nrounds = nrounds + 1L ) - expect_true(lgb.is.Booster(bst2)) + expect_true(.is_Booster(bst2)) expect_equal(bst2$current_iter(), nrounds + 1L) # model with 2 rounds + 1 update should be identical to 3 rounds @@ -716,7 +716,7 @@ test_that("Saving a model with different feature importance types works", { ) , nrounds = 2L ) - expect_true(lgb.is.Booster(bst)) + expect_true(.is_Booster(bst)) .feat_importance_from_string <- function(model_string) { file_lines <- strsplit(model_string, "\n", fixed = TRUE)[[1L]] @@ -772,7 +772,7 @@ test_that("Saving a model with unknown importance type fails", { ) , nrounds = 2L ) - expect_true(lgb.is.Booster(bst)) + expect_true(.is_Booster(bst)) UNSUPPORTED_IMPORTANCE <- 2L expect_error({ @@ -1372,7 +1372,7 @@ test_that("boosters with linear models at leaves work with saveRDS.lgb.Booster a , nrounds = 10L , params = params ) - expect_true(lgb.is.Booster(bst)) + expect_true(.is_Booster(bst)) # save predictions, then write the model to a file and destroy it in R preds <- predict(bst, X) @@ -1412,7 +1412,7 @@ test_that("boosters with linear models at leaves can be written to RDS and re-lo , nrounds = 10L , params = params ) - expect_true(lgb.is.Booster(bst)) + expect_true(.is_Booster(bst)) # save predictions, then write the model to a file and destroy it in R preds <- predict(bst, X) diff --git a/R-package/tests/testthat/test_utils.R b/R-package/tests/testthat/test_utils.R index 4ab05e075ae3..898aed9b0915 100644 --- a/R-package/tests/testthat/test_utils.R +++ b/R-package/tests/testthat/test_utils.R @@ -1,12 +1,12 @@ -test_that("lgb.params2str() works as expected for empty lists", { - out_str <- lgb.params2str( +test_that(".params2str() works as expected for empty lists", { + out_str <- .params2str( params = list() ) expect_identical(class(out_str), "character") expect_equal(out_str, "") }) -test_that("lgb.params2str() works as expected for a key in params with multiple different-length elements", { +test_that(".params2str() works as expected for a key in params with multiple different-length elements", { metrics <- c("a", "ab", "abc", "abcdefg") params <- list( objective = "magic" @@ -14,7 +14,7 @@ test_that("lgb.params2str() works as expected for a key in params with multiple , nrounds = 10L , learning_rate = 0.0000001 ) - out_str <- lgb.params2str( + out_str <- .params2str( params = params ) 
expect_identical(class(out_str), "character") @@ -24,8 +24,8 @@ test_that("lgb.params2str() works as expected for a key in params with multiple ) }) -test_that("lgb.params2str() passes through duplicated params", { - out_str <- lgb.params2str( +test_that(".params2str() passes through duplicated params", { + out_str <- .params2str( params = list( objective = "regression" , bagging_fraction = 0.8 @@ -35,8 +35,8 @@ test_that("lgb.params2str() passes through duplicated params", { expect_equal(out_str, "objective=regression bagging_fraction=0.8 bagging_fraction=0.5") }) -test_that("lgb.check.eval works as expected with no metric", { - params <- lgb.check.eval( +test_that(".check_eval works as expected with no metric", { + params <- .check_eval( params = list(device = "cpu") , eval = "binary_error" ) @@ -44,8 +44,8 @@ test_that("lgb.check.eval works as expected with no metric", { expect_identical(params[["metric"]], list("binary_error")) }) -test_that("lgb.check.eval adds eval to metric in params", { - params <- lgb.check.eval( +test_that(".check_eval adds eval to metric in params", { + params <- .check_eval( params = list(metric = "auc") , eval = "binary_error" ) @@ -53,8 +53,8 @@ test_that("lgb.check.eval adds eval to metric in params", { expect_identical(params[["metric"]], list("auc", "binary_error")) }) -test_that("lgb.check.eval adds eval to metric in params if two evaluation names are provided", { - params <- lgb.check.eval( +test_that(".check_eval adds eval to metric in params if two evaluation names are provided", { + params <- .check_eval( params = list(metric = "auc") , eval = c("binary_error", "binary_logloss") ) @@ -62,8 +62,8 @@ test_that("lgb.check.eval adds eval to metric in params if two evaluation names expect_identical(params[["metric"]], list("auc", "binary_error", "binary_logloss")) }) -test_that("lgb.check.eval adds eval to metric in params if a list is provided", { - params <- lgb.check.eval( +test_that(".check_eval adds eval to metric in params if a list is provided", { + params <- .check_eval( params = list(metric = "auc") , eval = list("binary_error", "binary_logloss") ) @@ -71,8 +71,8 @@ test_that("lgb.check.eval adds eval to metric in params if a list is provided", expect_identical(params[["metric"]], list("auc", "binary_error", "binary_logloss")) }) -test_that("lgb.check.eval drops duplicate metrics and preserves order", { - params <- lgb.check.eval( +test_that(".check_eval drops duplicate metrics and preserves order", { + params <- .check_eval( params = list(metric = "l1") , eval = list("l2", "rmse", "l1", "rmse") ) @@ -80,9 +80,9 @@ test_that("lgb.check.eval drops duplicate metrics and preserves order", { expect_identical(params[["metric"]], list("l1", "l2", "rmse")) }) -test_that("lgb.check.wrapper_param() uses passed-in keyword arg if no alias found in params", { +test_that(".check_wrapper_param() uses passed-in keyword arg if no alias found in params", { kwarg_val <- sample(seq_len(100L), size = 1L) - params <- lgb.check.wrapper_param( + params <- .check_wrapper_param( main_param_name = "num_iterations" , params = list() , alternative_kwarg_value = kwarg_val @@ -90,10 +90,10 @@ test_that("lgb.check.wrapper_param() uses passed-in keyword arg if no alias foun expect_equal(params[["num_iterations"]], kwarg_val) }) -test_that("lgb.check.wrapper_param() prefers main parameter to alias and keyword arg", { +test_that(".check_wrapper_param() prefers main parameter to alias and keyword arg", { num_iterations <- sample(seq_len(100L), size = 1L) kwarg_val <- 
sample(seq_len(100L), size = 1L) - params <- lgb.check.wrapper_param( + params <- .check_wrapper_param( main_param_name = "num_iterations" , params = list( num_iterations = num_iterations @@ -108,11 +108,11 @@ test_that("lgb.check.wrapper_param() prefers main parameter to alias and keyword expect_identical(params, list(num_iterations = num_iterations)) }) -test_that("lgb.check.wrapper_param() prefers alias to keyword arg", { +test_that(".check_wrapper_param() prefers alias to keyword arg", { n_estimators <- sample(seq_len(100L), size = 1L) num_tree <- sample(seq_len(100L), size = 1L) kwarg_val <- sample(seq_len(100L), size = 1L) - params <- lgb.check.wrapper_param( + params <- .check_wrapper_param( main_param_name = "num_iterations" , params = list( num_tree = num_tree @@ -124,7 +124,7 @@ test_that("lgb.check.wrapper_param() prefers alias to keyword arg", { expect_identical(params, list(num_iterations = num_tree)) # switching the order shouldn't switch which one is chosen - params2 <- lgb.check.wrapper_param( + params2 <- .check_wrapper_param( main_param_name = "num_iterations" , params = list( n_estimators = n_estimators @@ -136,14 +136,14 @@ test_that("lgb.check.wrapper_param() prefers alias to keyword arg", { expect_identical(params2, list(num_iterations = num_tree)) }) -test_that("lgb.equal.or.both.null produces expected results", { - expect_true(lgb.equal.or.both.null(NULL, NULL)) - expect_false(lgb.equal.or.both.null(1.0, NULL)) - expect_false(lgb.equal.or.both.null(NULL, 1.0)) - expect_true(lgb.equal.or.both.null(1.0, 1.0)) - expect_true(lgb.equal.or.both.null(1.0, 1L)) - expect_false(lgb.equal.or.both.null(NA, NULL)) - expect_false(lgb.equal.or.both.null(NULL, NA)) - expect_false(lgb.equal.or.both.null(10.0, 1L)) - expect_true(lgb.equal.or.both.null(0L, 0L)) +test_that(".equal_or_both_null produces expected results", { + expect_true(.equal_or_both_null(NULL, NULL)) + expect_false(.equal_or_both_null(1.0, NULL)) + expect_false(.equal_or_both_null(NULL, 1.0)) + expect_true(.equal_or_both_null(1.0, 1.0)) + expect_true(.equal_or_both_null(1.0, 1L)) + expect_false(.equal_or_both_null(NA, NULL)) + expect_false(.equal_or_both_null(NULL, NA)) + expect_false(.equal_or_both_null(10.0, 1L)) + expect_true(.equal_or_both_null(0L, 0L)) }) From e63e54ace02afbc1c1f27505edf65c92733ac50b Mon Sep 17 00:00:00 2001 From: James Lamb Date: Mon, 13 Nov 2023 20:26:35 -0600 Subject: [PATCH 03/19] [docs] reduce redirects in docs links (#6181) --- docs/Experiments.rst | 26 +++++++++++++------------- docs/Features.rst | 2 +- docs/GPU-Performance.rst | 6 +++--- docs/Installation-Guide.rst | 2 +- docs/Parallel-Learning-Guide.rst | 4 ++-- docs/Parameters.rst | 16 ++++++++-------- docs/Quick-Start.rst | 2 +- include/LightGBM/config.h | 16 ++++++++-------- 8 files changed, 37 insertions(+), 37 deletions(-) diff --git a/docs/Experiments.rst b/docs/Experiments.rst index c314321e7a3a..4440a2c0ccae 100644 --- a/docs/Experiments.rst +++ b/docs/Experiments.rst @@ -18,19 +18,19 @@ Data We used 5 datasets to conduct our comparison experiments. 
Details of data are listed in the following table: -+-----------+-----------------------+------------------------------------------------------------------------+-------------+----------+----------------------------------------------+ -| Data | Task | Link | #Train\_Set | #Feature | Comments | -+===========+=======================+========================================================================+=============+==========+==============================================+ -| Higgs | Binary classification | `link `__ | 10,500,000 | 28 | last 500,000 samples were used as test set | -+-----------+-----------------------+------------------------------------------------------------------------+-------------+----------+----------------------------------------------+ -| Yahoo LTR | Learning to rank | `link `__ | 473,134 | 700 | set1.train as train, set1.test as test | -+-----------+-----------------------+------------------------------------------------------------------------+-------------+----------+----------------------------------------------+ -| MS LTR | Learning to rank | `link `__ | 2,270,296 | 137 | {S1,S2,S3} as train set, {S5} as test set | -+-----------+-----------------------+------------------------------------------------------------------------+-------------+----------+----------------------------------------------+ -| Expo | Binary classification | `link `__ | 11,000,000 | 700 | last 1,000,000 samples were used as test set | -+-----------+-----------------------+------------------------------------------------------------------------+-------------+----------+----------------------------------------------+ -| Allstate | Binary classification | `link `__ | 13,184,290 | 4228 | last 1,000,000 samples were used as test set | -+-----------+-----------------------+------------------------------------------------------------------------+-------------+----------+----------------------------------------------+ ++-----------+-----------------------+---------------------------------------------------------------------------------+-------------+----------+----------------------------------------------+ +| Data | Task | Link | #Train\_Set | #Feature | Comments | ++===========+=======================+=================================================================================+=============+==========+==============================================+ +| Higgs | Binary classification | `link `__ | 10,500,000 | 28 | last 500,000 samples were used as test set | ++-----------+-----------------------+---------------------------------------------------------------------------------+-------------+----------+----------------------------------------------+ +| Yahoo LTR | Learning to rank | `link `__ | 473,134 | 700 | set1.train as train, set1.test as test | ++-----------+-----------------------+---------------------------------------------------------------------------------+-------------+----------+----------------------------------------------+ +| MS LTR | Learning to rank | `link `__ | 2,270,296 | 137 | {S1,S2,S3} as train set, {S5} as test set | ++-----------+-----------------------+---------------------------------------------------------------------------------+-------------+----------+----------------------------------------------+ +| Expo | Binary classification | `link `__ | 11,000,000 | 700 | last 1,000,000 samples were used as test set | 
++-----------+-----------------------+---------------------------------------------------------------------------------+-------------+----------+----------------------------------------------+ +| Allstate | Binary classification | `link `__ | 13,184,290 | 4228 | last 1,000,000 samples were used as test set | ++-----------+-----------------------+---------------------------------------------------------------------------------+-------------+----------+----------------------------------------------+ Environment ^^^^^^^^^^^ diff --git a/docs/Features.rst b/docs/Features.rst index a7db86ec2935..89b56646588f 100644 --- a/docs/Features.rst +++ b/docs/Features.rst @@ -291,7 +291,7 @@ References .. _On Grouping for Maximum Homogeneity: https://www.tandfonline.com/doi/abs/10.1080/01621459.1958.10501479 -.. _Optimization of collective communication operations in MPICH: https://www.mcs.anl.gov/~thakur/papers/ijhpca-coll.pdf +.. _Optimization of collective communication operations in MPICH: https://web.cels.anl.gov/~thakur/papers/ijhpca-coll.pdf .. _A Communication-Efficient Parallel Algorithm for Decision Tree: http://papers.nips.cc/paper/6381-a-communication-efficient-parallel-algorithm-for-decision-tree diff --git a/docs/GPU-Performance.rst b/docs/GPU-Performance.rst index be1c1051bb28..64cd78eb4202 100644 --- a/docs/GPU-Performance.rst +++ b/docs/GPU-Performance.rst @@ -194,7 +194,7 @@ following article: Huan Zhang, Si Si and Cho-Jui Hsieh. `GPU Acceleration for Large-scale Tree Boosting`_. SysML Conference, 2018. -.. _link1: https://archive.ics.uci.edu/ml/datasets/HIGGS +.. _link1: https://archive.ics.uci.edu/dataset/280/higgs .. _link2: https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html @@ -202,9 +202,9 @@ Huan Zhang, Si Si and Cho-Jui Hsieh. `GPU Acceleration for Large-scale Tree Boos .. _link4: https://webscope.sandbox.yahoo.com/catalog.php?datatype=c -.. _link5: http://research.microsoft.com/en-us/projects/mslr/ +.. _link5: https://www.microsoft.com/en-us/research/project/mslr/ -.. _link6: http://stat-computing.org/dataexpo/2009/ +.. _link6: https://community.amstat.org/jointscsg-section/dataexpo/dataexpo2009 .. _0bb4a82: https://github.com/microsoft/LightGBM/commit/0bb4a82 diff --git a/docs/Installation-Guide.rst b/docs/Installation-Guide.rst index 1acfbcefa711..564fa7304902 100644 --- a/docs/Installation-Guide.rst +++ b/docs/Installation-Guide.rst @@ -950,7 +950,7 @@ gcc .. _RDMA: https://en.wikipedia.org/wiki/Remote_direct_memory_access -.. _MS MPI: https://docs.microsoft.com/en-us/message-passing-interface/microsoft-mpi-release-notes +.. _MS MPI: https://learn.microsoft.com/en-us/message-passing-interface/microsoft-mpi-release-notes .. _Open MPI: https://www.open-mpi.org/ diff --git a/docs/Parallel-Learning-Guide.rst b/docs/Parallel-Learning-Guide.rst index a347be942570..cbc7b1012b98 100644 --- a/docs/Parallel-Learning-Guide.rst +++ b/docs/Parallel-Learning-Guide.rst @@ -518,7 +518,7 @@ See `the mars documentation`_ for usage examples. .. _the Dask DataFrame documentation: https://docs.dask.org/en/latest/dataframe.html -.. _the Dask prediction example: https://github.com/microsoft/lightgbm/tree/master/examples/python-guide/dask/prediction.py +.. _the Dask prediction example: https://github.com/microsoft/LightGBM/blob/master/examples/python-guide/dask/prediction.py .. _the Dask worker documentation: https://distributed.dask.org/en/stable/worker-memory.html @@ -536,7 +536,7 @@ See `the mars documentation`_ for usage examples. .. 
_lightgbm_ray: https://github.com/ray-project/lightgbm_ray -.. _Ray: https://ray.io/ +.. _Ray: https://www.ray.io/ .. _the lightgbm_ray documentation: https://docs.ray.io/en/latest/tune/api_docs/integration.html#lightgbm-tune-integration-lightgbm diff --git a/docs/Parameters.rst b/docs/Parameters.rst index 86104ba5be55..329f9c38656e 100644 --- a/docs/Parameters.rst +++ b/docs/Parameters.rst @@ -119,7 +119,7 @@ Core Parameters - ranking application - - ``lambdarank``, `lambdarank `__ objective. `label_gain <#label_gain>`__ can be used to set the gain (weight) of ``int`` label and all values in ``label`` must be smaller than number of elements in ``label_gain`` + - ``lambdarank``, `lambdarank `__ objective. `label_gain <#label_gain>`__ can be used to set the gain (weight) of ``int`` label and all values in ``label`` must be smaller than number of elements in ``label_gain`` - ``rank_xendcg``, `XE_NDCG_MART `__ ranking objective function, aliases: ``xendcg``, ``xe_ndcg``, ``xe_ndcg_mart``, ``xendcg_mart`` @@ -536,15 +536,15 @@ Learning Control Parameters - ``basic``, the most basic monotone constraints method. It does not slow the library at all, but over-constrains the predictions - - ``intermediate``, a `more advanced method `__, which may slow the library very slightly. However, this method is much less constraining than the basic method and should significantly improve the results + - ``intermediate``, a `more advanced method `__, which may slow the library very slightly. However, this method is much less constraining than the basic method and should significantly improve the results - - ``advanced``, an `even more advanced method `__, which may slow the library. However, this method is even less constraining than the intermediate method and should again significantly improve the results + - ``advanced``, an `even more advanced method `__, which may slow the library. However, this method is even less constraining than the intermediate method and should again significantly improve the results - ``monotone_penalty`` :raw-html:`🔗︎`, default = ``0.0``, type = double, aliases: ``monotone_splits_penalty``, ``ms_penalty``, ``mc_penalty``, constraints: ``monotone_penalty >= 0.0`` - used only if ``monotone_constraints`` is set - - `monotone penalty `__: a penalization parameter X forbids any monotone splits on the first X (rounded down) level(s) of the tree. The penalty applied to monotone splits on a given depth is a continuous, increasing function the penalization parameter + - `monotone penalty `__: a penalization parameter X forbids any monotone splits on the first X (rounded down) level(s) of the tree. 
The penalty applied to monotone splits on a given depth is a continuous, increasing function the penalization parameter - if ``0.0`` (the default), no penalization is applied @@ -564,7 +564,7 @@ Learning Control Parameters - **Note**: the forced split logic will be ignored, if the split makes gain worse - - see `this file `__ as an example + - see `this file `__ as an example - ``refit_decay_rate`` :raw-html:`🔗︎`, default = ``0.9``, type = double, constraints: ``0.0 <= refit_decay_rate <= 1.0`` @@ -770,7 +770,7 @@ Dataset Parameters - ``enable_bundle`` :raw-html:`🔗︎`, default = ``true``, type = bool, aliases: ``is_enable_bundle``, ``bundle`` - - set this to ``false`` to disable Exclusive Feature Bundling (EFB), which is described in `LightGBM: A Highly Efficient Gradient Boosting Decision Tree `__ + - set this to ``false`` to disable Exclusive Feature Bundling (EFB), which is described in `LightGBM: A Highly Efficient Gradient Boosting Decision Tree `__ - **Note**: disabling this may cause the slow training speed for sparse datasets @@ -894,7 +894,7 @@ Dataset Parameters - ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bound`` (array of thresholds for binning) - - see `this file `__ as an example + - see `this file `__ as an example - ``save_binary`` :raw-html:`🔗︎`, default = ``false``, type = bool, aliases: ``is_save_binary``, ``is_save_binary_file`` @@ -961,7 +961,7 @@ Predict Parameters - produces ``#features + 1`` values where the last value is the expected value of the model output over the training data - - **Note**: if you want to get more explanation for your model's predictions using SHAP values like SHAP interaction values, you can install `shap package `__ + - **Note**: if you want to get more explanation for your model's predictions using SHAP values like SHAP interaction values, you can install `shap package `__ - **Note**: unlike the shap package, with ``predict_contrib`` we return a matrix with an extra column, where the last column is the expected value diff --git a/docs/Quick-Start.rst b/docs/Quick-Start.rst index 04e64beb1281..30b0b3c228a2 100644 --- a/docs/Quick-Start.rst +++ b/docs/Quick-Start.rst @@ -85,4 +85,4 @@ Examples .. _LibSVM: https://www.csie.ntu.edu.tw/~cjlin/libsvm/ -.. _Expo data: http://stat-computing.org/dataexpo/2009/ +.. _Expo data: https://community.amstat.org/jointscsg-section/dataexpo/dataexpo2009 diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index 6d61bc764924..6500cb77272d 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -150,7 +150,7 @@ struct Config { // descl2 = ``cross_entropy_lambda``, alternative parameterization of cross-entropy, aliases: ``xentlambda`` // descl2 = label is anything in interval [0, 1] // desc = ranking application - // descl2 = ``lambdarank``, `lambdarank `__ objective. `label_gain <#label_gain>`__ can be used to set the gain (weight) of ``int`` label and all values in ``label`` must be smaller than number of elements in ``label_gain`` + // descl2 = ``lambdarank``, `lambdarank `__ objective. 
`label_gain <#label_gain>`__ can be used to set the gain (weight) of ``int`` label and all values in ``label`` must be smaller than number of elements in ``label_gain`` // descl2 = ``rank_xendcg``, `XE_NDCG_MART `__ ranking objective function, aliases: ``xendcg``, ``xe_ndcg``, ``xe_ndcg_mart``, ``xendcg_mart`` // descl2 = ``rank_xendcg`` is faster than and achieves the similar performance as ``lambdarank`` // descl2 = label should be ``int`` type, and larger number represents the higher relevance (e.g. 0:bad, 1:fair, 2:good, 3:perfect) @@ -501,14 +501,14 @@ struct Config { // desc = used only if ``monotone_constraints`` is set // desc = monotone constraints method // descl2 = ``basic``, the most basic monotone constraints method. It does not slow the library at all, but over-constrains the predictions - // descl2 = ``intermediate``, a `more advanced method `__, which may slow the library very slightly. However, this method is much less constraining than the basic method and should significantly improve the results - // descl2 = ``advanced``, an `even more advanced method `__, which may slow the library. However, this method is even less constraining than the intermediate method and should again significantly improve the results + // descl2 = ``intermediate``, a `more advanced method `__, which may slow the library very slightly. However, this method is much less constraining than the basic method and should significantly improve the results + // descl2 = ``advanced``, an `even more advanced method `__, which may slow the library. However, this method is even less constraining than the intermediate method and should again significantly improve the results std::string monotone_constraints_method = "basic"; // alias = monotone_splits_penalty, ms_penalty, mc_penalty // check = >=0.0 // desc = used only if ``monotone_constraints`` is set - // desc = `monotone penalty `__: a penalization parameter X forbids any monotone splits on the first X (rounded down) level(s) of the tree. The penalty applied to monotone splits on a given depth is a continuous, increasing function the penalization parameter + // desc = `monotone penalty `__: a penalization parameter X forbids any monotone splits on the first X (rounded down) level(s) of the tree. 
The penalty applied to monotone splits on a given depth is a continuous, increasing function the penalization parameter // desc = if ``0.0`` (the default), no penalization is applied double monotone_penalty = 0.0; @@ -524,7 +524,7 @@ struct Config { // desc = ``.json`` file can be arbitrarily nested, and each split contains ``feature``, ``threshold`` fields, as well as ``left`` and ``right`` fields representing subsplits // desc = categorical splits are forced in a one-hot fashion, with ``left`` representing the split containing the feature value and ``right`` representing other values // desc = **Note**: the forced split logic will be ignored, if the split makes gain worse - // desc = see `this file `__ as an example + // desc = see `this file `__ as an example std::string forcedsplits_filename = ""; // check = >=0.0 @@ -683,7 +683,7 @@ struct Config { bool is_enable_sparse = true; // alias = is_enable_bundle, bundle - // desc = set this to ``false`` to disable Exclusive Feature Bundling (EFB), which is described in `LightGBM: A Highly Efficient Gradient Boosting Decision Tree `__ + // desc = set this to ``false`` to disable Exclusive Feature Bundling (EFB), which is described in `LightGBM: A Highly Efficient Gradient Boosting Decision Tree `__ // desc = **Note**: disabling this may cause the slow training speed for sparse datasets bool enable_bundle = true; @@ -770,7 +770,7 @@ struct Config { // desc = path to a ``.json`` file that specifies bin upper bounds for some or all features // desc = ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bound`` (array of thresholds for binning) - // desc = see `this file `__ as an example + // desc = see `this file `__ as an example std::string forcedbins_filename = ""; // [no-save] @@ -826,7 +826,7 @@ struct Config { // desc = used only in ``prediction`` task // desc = set this to ``true`` to estimate `SHAP values `__, which represent how each feature contributes to each prediction // desc = produces ``#features + 1`` values where the last value is the expected value of the model output over the training data - // desc = **Note**: if you want to get more explanation for your model's predictions using SHAP values like SHAP interaction values, you can install `shap package `__ + // desc = **Note**: if you want to get more explanation for your model's predictions using SHAP values like SHAP interaction values, you can install `shap package `__ // desc = **Note**: unlike the shap package, with ``predict_contrib`` we return a matrix with an extra column, where the last column is the expected value // desc = **Note**: this feature is not implemented for linear trees bool predict_contrib = false; From 18dbd65e57995618ee2a8b1f7e4cb0df1f9c6333 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Wed, 15 Nov 2023 22:10:54 -0600 Subject: [PATCH 04/19] [python-package] consolidate pandas-to-numpy conversion code (#6156) --- python-package/lightgbm/basic.py | 53 ++++++++++++++++---------------- 1 file changed, 27 insertions(+), 26 deletions(-) diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index 939842df3389..b085e6fe8d36 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -758,6 +758,23 @@ def _check_for_bad_pandas_dtypes(pandas_dtypes_series: pd_Series) -> None: f'Fields with bad pandas dtypes: {", ".join(bad_pandas_dtypes)}') +def _pandas_to_numpy( + data: pd_DataFrame, + target_dtype: "np.typing.DTypeLike" +) -> 
np.ndarray: + _check_for_bad_pandas_dtypes(data.dtypes) + try: + # most common case (no nullable dtypes) + return data.to_numpy(dtype=target_dtype, copy=False) + except TypeError: + # 1.0 <= pd version < 1.1 and nullable dtypes, least common case + # raises error because array is casted to type(pd.NA) and there's no na_value argument + return data.astype(target_dtype, copy=False).values + except ValueError: + # data has nullable dtypes, but we can specify na_value argument and copy will be made + return data.to_numpy(dtype=target_dtype, na_value=np.nan) + + def _data_from_pandas( data: pd_DataFrame, feature_name: _LGBM_FeatureNameConfiguration, @@ -790,22 +807,17 @@ def _data_from_pandas( else: # use cat cols specified by user categorical_feature = list(categorical_feature) # type: ignore[assignment] - # get numpy representation of the data - _check_for_bad_pandas_dtypes(data.dtypes) df_dtypes = [dtype.type for dtype in data.dtypes] - df_dtypes.append(np.float32) # so that the target dtype considers floats + # so that the target dtype considers floats + df_dtypes.append(np.float32) target_dtype = np.result_type(*df_dtypes) - try: - # most common case (no nullable dtypes) - data = data.to_numpy(dtype=target_dtype, copy=False) - except TypeError: - # 1.0 <= pd version < 1.1 and nullable dtypes, least common case - # raises error because array is casted to type(pd.NA) and there's no na_value argument - data = data.astype(target_dtype, copy=False).values - except ValueError: - # data has nullable dtypes, but we can specify na_value argument and copy will be made - data = data.to_numpy(dtype=target_dtype, na_value=np.nan) - return data, feature_name, categorical_feature, pandas_categorical + + return ( + _pandas_to_numpy(data, target_dtype=target_dtype), + feature_name, + categorical_feature, + pandas_categorical + ) def _dump_pandas_categorical( @@ -2805,18 +2817,7 @@ def set_label(self, label: Optional[_LGBM_LabelType]) -> "Dataset": if isinstance(label, pd_DataFrame): if len(label.columns) > 1: raise ValueError('DataFrame for label cannot have multiple columns') - _check_for_bad_pandas_dtypes(label.dtypes) - try: - # most common case (no nullable dtypes) - label = label.to_numpy(dtype=np.float32, copy=False) - except TypeError: - # 1.0 <= pd version < 1.1 and nullable dtypes, least common case - # raises error because array is casted to type(pd.NA) and there's no na_value argument - label = label.astype(np.float32, copy=False).values - except ValueError: - # data has nullable dtypes, but we can specify na_value argument and copy will be made - label = label.to_numpy(dtype=np.float32, na_value=np.nan) - label_array = np.ravel(label) + label_array = np.ravel(_pandas_to_numpy(label, target_dtype=np.float32)) elif _is_pyarrow_array(label): label_array = label else: From d9a4cf498c336d8ca7803f9da3320a36080398af Mon Sep 17 00:00:00 2001 From: James Lamb Date: Wed, 22 Nov 2023 12:36:21 -0600 Subject: [PATCH 05/19] [python-package] ignore mypy errors related to ctypes string buffers (#6198) --- python-package/lightgbm/basic.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index b085e6fe8d36..008ff1727d78 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -2948,7 +2948,7 @@ def get_feature_name(self) -> List[str]: reserved_string_buffer_size = 255 required_string_buffer_size = ctypes.c_size_t(0) string_buffers = 
[ctypes.create_string_buffer(reserved_string_buffer_size) for _ in range(num_feature)] - ptr_string_buffers = (ctypes.c_char_p * num_feature)(*map(ctypes.addressof, string_buffers)) + ptr_string_buffers = (ctypes.c_char_p * num_feature)(*map(ctypes.addressof, string_buffers)) # type: ignore[misc] _safe_call(_LIB.LGBM_DatasetGetFeatureNames( self._handle, ctypes.c_int(num_feature), @@ -2962,7 +2962,7 @@ def get_feature_name(self) -> List[str]: # if buffer length is not long enough, reallocate buffers if reserved_string_buffer_size < actual_string_buffer_size: string_buffers = [ctypes.create_string_buffer(actual_string_buffer_size) for _ in range(num_feature)] - ptr_string_buffers = (ctypes.c_char_p * num_feature)(*map(ctypes.addressof, string_buffers)) + ptr_string_buffers = (ctypes.c_char_p * num_feature)(*map(ctypes.addressof, string_buffers)) # type: ignore[misc] _safe_call(_LIB.LGBM_DatasetGetFeatureNames( self._handle, ctypes.c_int(num_feature), @@ -4628,7 +4628,7 @@ def feature_name(self) -> List[str]: reserved_string_buffer_size = 255 required_string_buffer_size = ctypes.c_size_t(0) string_buffers = [ctypes.create_string_buffer(reserved_string_buffer_size) for _ in range(num_feature)] - ptr_string_buffers = (ctypes.c_char_p * num_feature)(*map(ctypes.addressof, string_buffers)) + ptr_string_buffers = (ctypes.c_char_p * num_feature)(*map(ctypes.addressof, string_buffers)) # type: ignore[misc] _safe_call(_LIB.LGBM_BoosterGetFeatureNames( self._handle, ctypes.c_int(num_feature), @@ -4642,7 +4642,7 @@ def feature_name(self) -> List[str]: # if buffer length is not long enough, reallocate buffers if reserved_string_buffer_size < actual_string_buffer_size: string_buffers = [ctypes.create_string_buffer(actual_string_buffer_size) for _ in range(num_feature)] - ptr_string_buffers = (ctypes.c_char_p * num_feature)(*map(ctypes.addressof, string_buffers)) + ptr_string_buffers = (ctypes.c_char_p * num_feature)(*map(ctypes.addressof, string_buffers)) # type: ignore[misc] _safe_call(_LIB.LGBM_BoosterGetFeatureNames( self._handle, ctypes.c_int(num_feature), @@ -4852,7 +4852,7 @@ def __get_eval_info(self) -> None: string_buffers = [ ctypes.create_string_buffer(reserved_string_buffer_size) for _ in range(self.__num_inner_eval) ] - ptr_string_buffers = (ctypes.c_char_p * self.__num_inner_eval)(*map(ctypes.addressof, string_buffers)) + ptr_string_buffers = (ctypes.c_char_p * self.__num_inner_eval)(*map(ctypes.addressof, string_buffers)) # type: ignore[misc] _safe_call(_LIB.LGBM_BoosterGetEvalNames( self._handle, ctypes.c_int(self.__num_inner_eval), @@ -4868,7 +4868,7 @@ def __get_eval_info(self) -> None: string_buffers = [ ctypes.create_string_buffer(actual_string_buffer_size) for _ in range(self.__num_inner_eval) ] - ptr_string_buffers = (ctypes.c_char_p * self.__num_inner_eval)(*map(ctypes.addressof, string_buffers)) + ptr_string_buffers = (ctypes.c_char_p * self.__num_inner_eval)(*map(ctypes.addressof, string_buffers)) # type: ignore[misc] _safe_call(_LIB.LGBM_BoosterGetEvalNames( self._handle, ctypes.c_int(self.__num_inner_eval), From bc6942226e475be048dd0aac7f2cc6334f849aef Mon Sep 17 00:00:00 2001 From: James Lamb Date: Wed, 22 Nov 2023 14:15:11 -0600 Subject: [PATCH 06/19] [CUDA] fix typo in error message (#6207) --- src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp b/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp index fa782ebaad25..a1ea79efa1a1 100644 --- 
a/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp +++ b/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp @@ -155,7 +155,7 @@ class CUDASingleGPUTreeLearner: public SerialTreeLearner { #pragma warning(disable : 4702) explicit CUDASingleGPUTreeLearner(const Config* tree_config, const bool /*boosting_on_cuda*/) : SerialTreeLearner(tree_config) { Log::Fatal("CUDA Tree Learner was not enabled in this build.\n" - "Please recompile with CMake option -DUSE_CUDAP=1"); + "Please recompile with CMake option -DUSE_CUDA=1"); } }; From 516bde95015b05e57ff41b19d9bec19b0c48d7e6 Mon Sep 17 00:00:00 2001 From: Oliver Borchert Date: Wed, 22 Nov 2023 22:50:31 +0100 Subject: [PATCH 07/19] [python-package] Allow to pass Arrow array as groups (#6166) --- include/LightGBM/c_api.h | 3 +- include/LightGBM/dataset.h | 4 ++ python-package/lightgbm/basic.py | 15 +++-- src/io/dataset.cpp | 2 + src/io/metadata.cpp | 28 ++++++--- tests/python_package_test/test_arrow.py | 77 +++++++++++++++++-------- 6 files changed, 89 insertions(+), 40 deletions(-) diff --git a/include/LightGBM/c_api.h b/include/LightGBM/c_api.h index fd337cbc7cbe..eafe6fab7825 100644 --- a/include/LightGBM/c_api.h +++ b/include/LightGBM/c_api.h @@ -558,9 +558,10 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetSetField(DatasetHandle handle, /*! * \brief Set vector to a content in info. * \note + * - \a group converts input datatype into ``int32``; * - \a label and \a weight convert input datatype into ``float32``. * \param handle Handle of dataset - * \param field_name Field name, can be \a label, \a weight + * \param field_name Field name, can be \a label, \a weight, \a group * \param n_chunks The number of Arrow arrays passed to this function * \param chunks Pointer to the list of Arrow arrays * \param schema Pointer to the schema of all Arrow arrays diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index 48c1bee804d7..bf8264276a5f 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -116,6 +116,7 @@ class Metadata { void SetWeights(const ArrowChunkedArray& array); void SetQuery(const data_size_t* query, data_size_t len); + void SetQuery(const ArrowChunkedArray& array); void SetPosition(const data_size_t* position, data_size_t len); @@ -348,6 +349,9 @@ class Metadata { void InsertInitScores(const double* init_scores, data_size_t start_index, data_size_t len, data_size_t source_size); /*! \brief Insert queries at the given index */ void InsertQueries(const data_size_t* queries, data_size_t start_index, data_size_t len); + /*! \brief Set queries from pointers to the first element and the end of an iterator. */ + template + void SetQueriesFromIterator(It first, It last); /*! \brief Filename of current data */ std::string data_filename_; /*! \brief Number of data */ diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index 008ff1727d78..b55546941f77 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -70,7 +70,9 @@ List[float], List[int], np.ndarray, - pd_Series + pd_Series, + pa_Array, + pa_ChunkedArray, ] _LGBM_PositionType = Union[ np.ndarray, @@ -1652,7 +1654,7 @@ def __init__( If this is Dataset for validation, training data should be used as reference. weight : list, numpy 1-D array, pandas Series, pyarrow Array, pyarrow ChunkedArray or None, optional (default=None) Weight for each instance. Weights should be non-negative. 
- group : list, numpy 1-D array, pandas Series or None, optional (default=None) + group : list, numpy 1-D array, pandas Series, pyarrow Array, pyarrow ChunkedArray or None, optional (default=None) Group/query data. Only used in the learning-to-rank task. sum(group) = n_samples. @@ -2432,7 +2434,7 @@ def create_valid( Label of the data. weight : list, numpy 1-D array, pandas Series, pyarrow Array, pyarrow ChunkedArray or None, optional (default=None) Weight for each instance. Weights should be non-negative. - group : list, numpy 1-D array, pandas Series or None, optional (default=None) + group : list, numpy 1-D array, pandas Series, pyarrow Array, pyarrow ChunkedArray or None, optional (default=None) Group/query data. Only used in the learning-to-rank task. sum(group) = n_samples. @@ -2889,7 +2891,7 @@ def set_group( Parameters ---------- - group : list, numpy 1-D array, pandas Series or None + group : list, numpy 1-D array, pandas Series, pyarrow Array, pyarrow ChunkedArray or None Group/query data. Only used in the learning-to-rank task. sum(group) = n_samples. @@ -2903,7 +2905,8 @@ def set_group( """ self.group = group if self._handle is not None and group is not None: - group = _list_to_1d_numpy(group, dtype=np.int32, name='group') + if not _is_pyarrow_array(group): + group = _list_to_1d_numpy(group, dtype=np.int32, name='group') self.set_field('group', group) # original values can be modified at cpp side constructed_group = self.get_field('group') @@ -4431,7 +4434,7 @@ def refit( .. versionadded:: 4.0.0 - group : list, numpy 1-D array, pandas Series or None, optional (default=None) + group : list, numpy 1-D array, pandas Series, pyarrow Array, pyarrow ChunkedArray or None, optional (default=None) Group/query size for ``data``. Only used in the learning-to-rank task. sum(group) = n_samples. 
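With the changes above, ranking group sizes can be handed to ``Dataset`` directly as Arrow data, just like labels and weights. A minimal sketch of the resulting usage (synthetic data, illustrative only; it mirrors the new tests further below):

    import numpy as np
    import pyarrow as pa
    import lightgbm as lgb

    # 1,000 documents with 5 features and integer relevance labels
    X = np.random.rand(1000, 5)
    y = np.random.randint(0, 4, size=1000)

    # group sizes may arrive as a chunked array (e.g. from a partitioned Arrow
    # table); their sum must equal the number of rows: 300 + 400 + 50 + 250 = 1000
    groups = pa.chunked_array([[300, 400, 50], [250]], type=pa.int32())

    dtrain = lgb.Dataset(X, label=y, group=groups)
    dtrain.construct()  # group sizes become int32 query boundaries on the C++ side

As with numpy inputs, LightGBM stores cumulative query boundaries internally, so ``dtrain.get_field("group")`` on the constructed dataset returns the boundaries ``[0, 300, 700, 750, 1000]`` rather than the sizes themselves.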
diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp
index 01eb41b71367..78dd5e4319a5 100644
--- a/src/io/dataset.cpp
+++ b/src/io/dataset.cpp
@@ -904,6 +904,8 @@ bool Dataset::SetFieldFromArrow(const char* field_name, const ArrowChunkedArray
     metadata_.SetLabel(ca);
   } else if (name == std::string("weight") || name == std::string("weights")) {
     metadata_.SetWeights(ca);
+  } else if (name == std::string("query") || name == std::string("group")) {
+    metadata_.SetQuery(ca);
   } else {
     return false;
   }
diff --git a/src/io/metadata.cpp b/src/io/metadata.cpp
index ed4fb135e62a..d94b0ed3f2f7 100644
--- a/src/io/metadata.cpp
+++ b/src/io/metadata.cpp
@@ -507,30 +507,34 @@ void Metadata::InsertWeights(const label_t* weights, data_size_t start_index, da
   // CUDA is handled after all insertions are complete
 }
 
-void Metadata::SetQuery(const data_size_t* query, data_size_t len) {
+template <typename It>
+void Metadata::SetQueriesFromIterator(It first, It last) {
   std::lock_guard<std::mutex> lock(mutex_);
-  // save to nullptr
-  if (query == nullptr || len == 0) {
+  // Clear query boundaries on empty input
+  if (last - first == 0) {
     query_boundaries_.clear();
     num_queries_ = 0;
     return;
   }
+
   data_size_t sum = 0;
 #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum)
-  for (data_size_t i = 0; i < len; ++i) {
-    sum += query[i];
+  for (data_size_t i = 0; i < last - first; ++i) {
+    sum += first[i];
   }
   if (num_data_ != sum) {
-    Log::Fatal("Sum of query counts is not same with #data");
+    Log::Fatal("Sum of query counts (%i) differs from the length of #data (%i)", sum, num_data_);
   }
-  num_queries_ = len;
+  num_queries_ = last - first;
+
   query_boundaries_.resize(num_queries_ + 1);
   query_boundaries_[0] = 0;
   for (data_size_t i = 0; i < num_queries_; ++i) {
-    query_boundaries_[i + 1] = query_boundaries_[i] + query[i];
+    query_boundaries_[i + 1] = query_boundaries_[i] + first[i];
   }
   CalculateQueryWeights();
   query_load_from_file_ = false;
+
 #ifdef USE_CUDA
   if (cuda_metadata_ != nullptr) {
     if (query_weights_.size() > 0) {
@@ -543,6 +547,14 @@ void Metadata::SetQuery(const data_size_t* query, data_size_t len) {
   #endif  // USE_CUDA
 }
 
+void Metadata::SetQuery(const data_size_t* query, data_size_t len) {
+  SetQueriesFromIterator(query, query + len);
+}
+
+void Metadata::SetQuery(const ArrowChunkedArray& array) {
+  SetQueriesFromIterator(array.begin<data_size_t>(), array.end<data_size_t>());
+}
+
 void Metadata::SetPosition(const data_size_t* positions, data_size_t len) {
   std::lock_guard<std::mutex> lock(mutex_);
   // save to nullptr
diff --git a/tests/python_package_test/test_arrow.py b/tests/python_package_test/test_arrow.py
index 40482a904a62..38b053e94fd5 100644
--- a/tests/python_package_test/test_arrow.py
+++ b/tests/python_package_test/test_arrow.py
@@ -1,7 +1,6 @@
 # coding: utf-8
 import filecmp
-from pathlib import Path
-from typing import Any, Callable, Dict
+from typing import Any, Dict
 
 import numpy as np
 import pyarrow as pa
@@ -15,6 +14,21 @@
 # UTILITIES #
 # ----------------------------------------------------------------------------------------------- #
 
+_INTEGER_TYPES = [
+    pa.int8(),
+    pa.int16(),
+    pa.int32(),
+    pa.int64(),
+    pa.uint8(),
+    pa.uint16(),
+    pa.uint32(),
+    pa.uint64(),
+]
+_FLOAT_TYPES = [
+    pa.float32(),
+    pa.float64(),
+]
+
 
 def generate_simple_arrow_table() -> pa.Table:
     columns = [
@@ -85,9 +99,7 @@ def dummy_dataset_params() -> Dict[str, Any]:
         (lambda: generate_random_arrow_table(100, 10000, 43), {}),
     ],
 )
-def test_dataset_construct_fuzzy(
-    tmp_path: Path, arrow_table_fn: Callable[[], pa.Table], dataset_params: Dict[str, Any]
-):
+def test_dataset_construct_fuzzy(tmp_path, arrow_table_fn, dataset_params): arrow_table = arrow_table_fn() arrow_dataset = lgb.Dataset(arrow_table, params=dataset_params) @@ -108,17 +120,23 @@ def test_dataset_construct_fields_fuzzy(): arrow_table = generate_random_arrow_table(3, 1000, 42) arrow_labels = generate_random_arrow_array(1000, 42) arrow_weights = generate_random_arrow_array(1000, 42) + arrow_groups = pa.chunked_array([[300, 400, 50], [250]], type=pa.int32()) - arrow_dataset = lgb.Dataset(arrow_table, label=arrow_labels, weight=arrow_weights) + arrow_dataset = lgb.Dataset( + arrow_table, label=arrow_labels, weight=arrow_weights, group=arrow_groups + ) arrow_dataset.construct() pandas_dataset = lgb.Dataset( - arrow_table.to_pandas(), label=arrow_labels.to_numpy(), weight=arrow_weights.to_numpy() + arrow_table.to_pandas(), + label=arrow_labels.to_numpy(), + weight=arrow_weights.to_numpy(), + group=arrow_groups.to_numpy(), ) pandas_dataset.construct() # Check for equality - for field in ("label", "weight"): + for field in ("label", "weight", "group"): np_assert_array_equal( arrow_dataset.get_field(field), pandas_dataset.get_field(field), strict=True ) @@ -133,22 +151,8 @@ def test_dataset_construct_fields_fuzzy(): ["array_type", "label_data"], [(pa.array, [0, 1, 0, 0, 1]), (pa.chunked_array, [[0], [1, 0, 0, 1]])], ) -@pytest.mark.parametrize( - "arrow_type", - [ - pa.int8(), - pa.int16(), - pa.int32(), - pa.int64(), - pa.uint8(), - pa.uint16(), - pa.uint32(), - pa.uint64(), - pa.float32(), - pa.float64(), - ], -) -def test_dataset_construct_labels(array_type: Any, label_data: Any, arrow_type: Any): +@pytest.mark.parametrize("arrow_type", _INTEGER_TYPES + _FLOAT_TYPES) +def test_dataset_construct_labels(array_type, label_data, arrow_type): data = generate_dummy_arrow_table() labels = array_type(label_data, type=arrow_type) dataset = lgb.Dataset(data, label=labels, params=dummy_dataset_params()) @@ -175,7 +179,7 @@ def test_dataset_construct_weights_none(): [(pa.array, [3, 0.7, 1.5, 0.5, 0.1]), (pa.chunked_array, [[3], [0.7, 1.5, 0.5, 0.1]])], ) @pytest.mark.parametrize("arrow_type", [pa.float32(), pa.float64()]) -def test_dataset_construct_weights(array_type: Any, weight_data: Any, arrow_type: Any): +def test_dataset_construct_weights(array_type, weight_data, arrow_type): data = generate_dummy_arrow_table() weights = array_type(weight_data, type=arrow_type) dataset = lgb.Dataset(data, weight=weights, params=dummy_dataset_params()) @@ -183,3 +187,26 @@ def test_dataset_construct_weights(array_type: Any, weight_data: Any, arrow_type expected = np.array([3, 0.7, 1.5, 0.5, 0.1], dtype=np.float32) np_assert_array_equal(expected, dataset.get_weight(), strict=True) + + +# -------------------------------------------- GROUPS ------------------------------------------- # + + +@pytest.mark.parametrize( + ["array_type", "group_data"], + [ + (pa.array, [2, 3]), + (pa.chunked_array, [[2], [3]]), + (pa.chunked_array, [[], [2, 3]]), + (pa.chunked_array, [[2], [], [3], []]), + ], +) +@pytest.mark.parametrize("arrow_type", _INTEGER_TYPES) +def test_dataset_construct_groups(array_type, group_data, arrow_type): + data = generate_dummy_arrow_table() + groups = array_type(group_data, type=arrow_type) + dataset = lgb.Dataset(data, group=groups, params=dummy_dataset_params()) + dataset.construct() + + expected = np.array([0, 2, 5], dtype=np.int32) + np_assert_array_equal(expected, dataset.get_field("group"), strict=True) From cd36ffeaedd948330b5512aa7e3b58e9e6a9220c Mon Sep 17 00:00:00 2001 From: 
david-cortes
Date: Sat, 25 Nov 2023 05:48:52 +0100
Subject: [PATCH 08/19] [R-package] Fix inefficiency in retrieving pointers
 (#6208)

---
 R-package/src/lightgbm_R.cpp | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/R-package/src/lightgbm_R.cpp b/R-package/src/lightgbm_R.cpp
index 21ba801a3a60..270f2a2d54d5 100644
--- a/R-package/src/lightgbm_R.cpp
+++ b/R-package/src/lightgbm_R.cpp
@@ -226,9 +226,10 @@ SEXP LGBM_DatasetGetSubset_R(SEXP handle,
   int32_t len = static_cast<int32_t>(Rf_asInteger(len_used_row_indices));
   std::vector<int32_t> idxvec(len);
   // convert from one-based to zero-based index
+  const int *used_row_indices_ = INTEGER(used_row_indices);
 #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (len >= 1024)
   for (int32_t i = 0; i < len; ++i) {
-    idxvec[i] = static_cast<int32_t>(INTEGER(used_row_indices)[i] - 1);
+    idxvec[i] = static_cast<int32_t>(used_row_indices_[i] - 1);
   }
   const char* parameters_ptr = CHAR(PROTECT(Rf_asChar(parameters)));
   DatasetHandle res = nullptr;
@@ -339,18 +340,20 @@ SEXP LGBM_DatasetSetField_R(SEXP handle,
   const char* name = CHAR(PROTECT(Rf_asChar(field_name)));
   if (!strcmp("group", name) || !strcmp("query", name)) {
     std::vector<int32_t> vec(len);
+    const int *field_data_ = INTEGER(field_data);
 #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (len >= 1024)
     for (int i = 0; i < len; ++i) {
-      vec[i] = static_cast<int32_t>(INTEGER(field_data)[i]);
+      vec[i] = static_cast<int32_t>(field_data_[i]);
     }
     CHECK_CALL(LGBM_DatasetSetField(R_ExternalPtrAddr(handle), name, vec.data(), len, C_API_DTYPE_INT32));
   } else if (!strcmp("init_score", name)) {
     CHECK_CALL(LGBM_DatasetSetField(R_ExternalPtrAddr(handle), name, REAL(field_data), len, C_API_DTYPE_FLOAT64));
   } else {
     std::vector<float> vec(len);
+    const double *field_data_ = REAL(field_data);
 #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (len >= 1024)
     for (int i = 0; i < len; ++i) {
-      vec[i] = static_cast<float>(REAL(field_data)[i]);
+      vec[i] = static_cast<float>(field_data_[i]);
     }
     CHECK_CALL(LGBM_DatasetSetField(R_ExternalPtrAddr(handle), name, vec.data(), len, C_API_DTYPE_FLOAT32));
   }
@@ -372,21 +375,24 @@ SEXP LGBM_DatasetGetField_R(SEXP handle,
   if (!strcmp("group", name) || !strcmp("query", name)) {
     auto p_data = reinterpret_cast<const int32_t*>(res);
     // convert from boundaries to size
+    int *field_data_ = INTEGER(field_data);
 #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (out_len >= 1024)
     for (int i = 0; i < out_len - 1; ++i) {
-      INTEGER(field_data)[i] = p_data[i + 1] - p_data[i];
+      field_data_[i] = p_data[i + 1] - p_data[i];
     }
   } else if (!strcmp("init_score", name)) {
     auto p_data = reinterpret_cast<const double*>(res);
+    double *field_data_ = REAL(field_data);
 #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (out_len >= 1024)
     for (int i = 0; i < out_len; ++i) {
-      REAL(field_data)[i] = p_data[i];
+      field_data_[i] = p_data[i];
     }
   } else {
     auto p_data = reinterpret_cast<const float*>(res);
+    double *field_data_ = REAL(field_data);
 #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (out_len >= 1024)
     for (int i = 0; i < out_len; ++i) {
-      REAL(field_data)[i] = p_data[i];
+      field_data_[i] = p_data[i];
     }
   }
   UNPROTECT(1);
@@ -611,10 +617,12 @@ SEXP LGBM_BoosterUpdateOneIterCustom_R(SEXP handle,
   int is_finished = 0;
   int int_len = Rf_asInteger(len);
   std::vector<float> tgrad(int_len), thess(int_len);
+  const double *grad_ = REAL(grad);
+  const double *hess_ = REAL(hess);
 #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (int_len >= 1024)
   for (int j = 0; j < int_len; ++j) {
-    tgrad[j] = static_cast<float>(REAL(grad)[j]);
-    thess[j] = static_cast<float>(REAL(hess)[j]);
+    tgrad[j] = static_cast<float>(grad_[j]);
+    thess[j] = static_cast<float>(hess_[j]);
   }
   CHECK_CALL(LGBM_BoosterUpdateOneIterCustom(R_ExternalPtrAddr(handle), tgrad.data(), thess.data(), &is_finished));
   return R_NilValue;

From 2ee3ec84b70df1a9e249d3b3bff9458fe3726cd4 Mon Sep 17 00:00:00 2001
From: shiyu1994
Date: Sat, 25 Nov 2023 13:24:44 +0800
Subject: [PATCH 09/19] [python-package] fix libpath.py (#6192)

---
 python-package/lightgbm/libpath.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/python-package/lightgbm/libpath.py b/python-package/lightgbm/libpath.py
index c096a6f1b5e2..21222228b0c2 100644
--- a/python-package/lightgbm/libpath.py
+++ b/python-package/lightgbm/libpath.py
@@ -16,8 +16,7 @@ def find_lib_path() -> List[str]:
         List of all found library paths to LightGBM.
     """
     curr_path = Path(__file__).absolute()
-    dll_path = [curr_path,
-                curr_path.parents[1],
+    dll_path = [curr_path.parents[1],
                 curr_path.parents[0] / 'bin',
                 curr_path.parents[0] / 'lib']
     if system() in ('Windows', 'Microsoft'):

From 848e76c3d690b3806a1e44809434d34aebad734a Mon Sep 17 00:00:00 2001
From: James Lamb
Date: Wed, 29 Nov 2023 22:33:46 -0600
Subject: [PATCH 10/19] [R-package] use safer pattern for error formatting
 (fixes #6212) (#6216)

---
 R-package/src/lightgbm_R.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R-package/src/lightgbm_R.cpp b/R-package/src/lightgbm_R.cpp
index 270f2a2d54d5..3ae7a98d8537 100644
--- a/R-package/src/lightgbm_R.cpp
+++ b/R-package/src/lightgbm_R.cpp
@@ -40,7 +40,7 @@ void LGBM_R_save_exception_msg(const std::string &err);
   catch(std::exception& ex) { LGBM_R_save_exception_msg(ex); } \
   catch(std::string& ex) { LGBM_R_save_exception_msg(ex); } \
   catch(...) { Rf_error("unknown exception"); } \
-  Rf_error(R_errmsg_buffer); \
+  Rf_error("%s", R_errmsg_buffer); \
   return R_NilValue; /* <- won't be reached */

 #define CHECK_CALL(x) \

From 5083df15c6866a78704787ca942931feaa096a76 Mon Sep 17 00:00:00 2001
From: James Lamb
Date: Thu, 30 Nov 2023 19:07:33 -0600
Subject: [PATCH 11/19] [docs] remove links to Laurae++ site (#6193)

---
 README.md           | 1 -
 docs/FAQ.rst        | 2 +-
 docs/Parameters.rst | 6 ------
 3 files changed, 1 insertion(+), 8 deletions(-)

diff --git a/README.md b/README.md
index f6f4e8c570e0..f3f63404b399 100644
--- a/README.md
+++ b/README.md
@@ -41,7 +41,6 @@ Next you may want to read:
 - [**Features**](https://github.com/microsoft/LightGBM/blob/master/docs/Features.rst) and algorithms supported by LightGBM.
 - [**Parameters**](https://github.com/microsoft/LightGBM/blob/master/docs/Parameters.rst) is an exhaustive list of customization you can make.
 - [**Distributed Learning**](https://github.com/microsoft/LightGBM/blob/master/docs/Parallel-Learning-Guide.rst) and [**GPU Learning**](https://github.com/microsoft/LightGBM/blob/master/docs/GPU-Tutorial.rst) can speed up computation.
-- [**Laurae++ interactive documentation**](https://sites.google.com/view/lauraepp/parameters) is a detailed guide for hyperparameters.
 - [**FLAML**](https://www.microsoft.com/en-us/research/project/fast-and-lightweight-automl-for-large-scale-data/articles/flaml-a-fast-and-lightweight-automl-library/) provides automated tuning for LightGBM ([code examples](https://microsoft.github.io/FLAML/docs/Examples/AutoML-for-LightGBM/)).
- [**Optuna Hyperparameter Tuner**](https://medium.com/optuna/lightgbm-tuner-new-optuna-integration-for-hyperparameter-optimization-8b7095e99258) provides automated tuning for LightGBM hyperparameters ([code examples](https://github.com/optuna/optuna-examples/blob/main/lightgbm/lightgbm_tuner_simple.py)). - [**Understanding LightGBM Parameters (and How to Tune Them using Neptune)**](https://neptune.ai/blog/lightgbm-parameters-guide). diff --git a/docs/FAQ.rst b/docs/FAQ.rst index 2e0002cb6bc1..31b35e4867d4 100644 --- a/docs/FAQ.rst +++ b/docs/FAQ.rst @@ -62,7 +62,7 @@ General LightGBM Questions 1. Where do I find more details about LightGBM parameters? ---------------------------------------------------------- -Take a look at `Parameters <./Parameters.rst>`__ and the `Laurae++/Parameters `__ website. +Take a look at `Parameters <./Parameters.rst>`__. 2. On datasets with millions of features, training does not start (or starts after a very long time). ----------------------------------------------------------------------------------------------------- diff --git a/docs/Parameters.rst b/docs/Parameters.rst index 329f9c38656e..341cdd487c71 100644 --- a/docs/Parameters.rst +++ b/docs/Parameters.rst @@ -14,10 +14,6 @@ This page contains descriptions of all parameters in LightGBM. - `Parameters Tuning <./Parameters-Tuning.rst>`__ -**External Links** - -- `Laurae++ Interactive Documentation`_ - Parameters Format ----------------- @@ -1380,5 +1376,3 @@ If the name of data file is ``train.txt``, the query file should be named as ``t In this case, LightGBM will load the query file automatically if it exists. Also, you can include query/group id column in your data file. Please refer to the ``group_column`` `parameter <#group_column>`__ in above. - -.. _Laurae++ Interactive Documentation: https://sites.google.com/view/lauraepp/parameters From f5b6bd60d9d752c8e5a75b11ab771d0422214bb4 Mon Sep 17 00:00:00 2001 From: Oliver Borchert Date: Mon, 4 Dec 2023 19:26:55 +0000 Subject: [PATCH 12/19] [python-package] Allow to pass Arrow table and array as init scores (#6167) --- include/LightGBM/c_api.h | 5 +-- include/LightGBM/dataset.h | 4 +++ python-package/lightgbm/basic.py | 28 ++++++++++----- python-package/lightgbm/compat.py | 2 ++ src/io/dataset.cpp | 2 ++ src/io/metadata.cpp | 28 ++++++++++----- tests/python_package_test/test_arrow.py | 45 ++++++++++++++++++++++++- 7 files changed, 95 insertions(+), 19 deletions(-) diff --git a/include/LightGBM/c_api.h b/include/LightGBM/c_api.h index eafe6fab7825..ada2e4109638 100644 --- a/include/LightGBM/c_api.h +++ b/include/LightGBM/c_api.h @@ -559,9 +559,10 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetSetField(DatasetHandle handle, * \brief Set vector to a content in info. * \note * - \a group converts input datatype into ``int32``; - * - \a label and \a weight convert input datatype into ``float32``. + * - \a label and \a weight convert input datatype into ``float32``; + * - \a init_score converts input datatype into ``float64``. 
* \param handle Handle of dataset - * \param field_name Field name, can be \a label, \a weight, \a group + * \param field_name Field name, can be \a label, \a weight, \a init_score, \a group * \param n_chunks The number of Arrow arrays passed to this function * \param chunks Pointer to the list of Arrow arrays * \param schema Pointer to the schema of all Arrow arrays diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index bf8264276a5f..220a1f9f009c 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -125,6 +125,7 @@ class Metadata { * \param init_score Initial scores, this class will manage memory for init_score. */ void SetInitScore(const double* init_score, data_size_t len); + void SetInitScore(const ArrowChunkedArray& array); /*! @@ -347,6 +348,9 @@ class Metadata { void SetWeightsFromIterator(It first, It last); /*! \brief Insert initial scores at the given index */ void InsertInitScores(const double* init_scores, data_size_t start_index, data_size_t len, data_size_t source_size); + /*! \brief Set init scores from pointers to the first element and the end of an iterator. */ + template + void SetInitScoresFromIterator(It first, It last); /*! \brief Insert queries at the given index */ void InsertQueries(const data_size_t* queries, data_size_t start_index, data_size_t len); /*! \brief Set queries from pointers to the first element and the end of an iterator. */ diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index b55546941f77..31ae5182ee9e 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -19,8 +19,8 @@ import scipy.sparse from .compat import (PANDAS_INSTALLED, PYARROW_INSTALLED, arrow_cffi, arrow_is_floating, arrow_is_integer, concat, - dt_DataTable, pa_Array, pa_ChunkedArray, pa_compute, pa_Table, pd_CategoricalDtype, pd_DataFrame, - pd_Series) + dt_DataTable, pa_Array, pa_chunked_array, pa_ChunkedArray, pa_compute, pa_Table, + pd_CategoricalDtype, pd_DataFrame, pd_Series) from .libpath import find_lib_path if TYPE_CHECKING: @@ -84,6 +84,9 @@ np.ndarray, pd_Series, pd_DataFrame, + pa_Table, + pa_Array, + pa_ChunkedArray, ] _LGBM_TrainDataType = Union[ str, @@ -1660,7 +1663,7 @@ def __init__( sum(group) = n_samples. For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups, where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc. - init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), or None, optional (default=None) + init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), pyarrow Array, pyarrow ChunkedArray, pyarrow Table (for multi-class task) or None, optional (default=None) Init score for Dataset. feature_name : list of str, or 'auto', optional (default="auto") Feature names. @@ -2440,7 +2443,7 @@ def create_valid( sum(group) = n_samples. For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups, where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc. 
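
(A Table, as opposed to an Array, is accepted for init_score so that multi-class tasks can pass one column of initial scores per class; the basic.py change in this patch concatenates those columns before handing them to the C API. A hedged sketch on synthetic data, with arbitrary column names:)

    import numpy as np
    import pyarrow as pa
    import lightgbm as lgb

    X = pa.table({"x": [1.0, 2.0, 3.0, 4.0, 5.0]})
    y = pa.array([0, 1, 2, 0, 1], type=pa.int32())

    # one column of initial scores per class; column names are arbitrary
    rng = np.random.default_rng(0)
    init_scores = pa.Table.from_arrays(
        [pa.array(rng.random(5)) for _ in range(3)],
        names=["class_0", "class_1", "class_2"],
    )

    params = {"min_data_in_bin": 1, "min_data_in_leaf": 1}
    ds = lgb.Dataset(X, label=y, init_score=init_scores, params=params).construct()
    print(np.asarray(ds.get_init_score()).shape)  # (5, 3), stored as float64
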
- init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), or None, optional (default=None) + init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), pyarrow Array, pyarrow ChunkedArray, pyarrow Table (for multi-class task) or None, optional (default=None) Init score for Dataset. params : dict or None, optional (default=None) Other parameters for validation Dataset. @@ -2547,7 +2550,7 @@ def _reverse_update_params(self) -> "Dataset": def set_field( self, field_name: str, - data: Optional[Union[List[List[float]], List[List[int]], List[float], List[int], np.ndarray, pd_Series, pd_DataFrame, pa_Array, pa_ChunkedArray]] + data: Optional[Union[List[List[float]], List[List[int]], List[float], List[int], np.ndarray, pd_Series, pd_DataFrame, pa_Table, pa_Array, pa_ChunkedArray]] ) -> "Dataset": """Set property into the Dataset. @@ -2576,7 +2579,16 @@ def set_field( return self # If the data is a arrow data, we can just pass it to C - if _is_pyarrow_array(data): + if _is_pyarrow_array(data) or _is_pyarrow_table(data): + # If a table is being passed, we concatenate the columns. This is only valid for + # 'init_score'. + if _is_pyarrow_table(data): + if field_name != "init_score": + raise ValueError(f"pyarrow tables are not supported for field '{field_name}'") + data = pa_chunked_array([ + chunk for array in data.columns for chunk in array.chunks # type: ignore + ]) + c_array = _export_arrow_to_c(data) _safe_call(_LIB.LGBM_DatasetSetFieldFromArrow( self._handle, @@ -2869,7 +2881,7 @@ def set_init_score( Parameters ---------- - init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), or None + init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), pyarrow Array, pyarrow ChunkedArray, pyarrow Table (for multi-class task) or None Init score for Booster. Returns @@ -4443,7 +4455,7 @@ def refit( .. versionadded:: 4.0.0 - init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), or None, optional (default=None) + init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), pyarrow Array, pyarrow ChunkedArray, pyarrow Table (for multi-class task) or None, optional (default=None) Init score for ``data``. .. 
versionadded:: 4.0.0 diff --git a/python-package/lightgbm/compat.py b/python-package/lightgbm/compat.py index dc48dbf792cf..bd1b29a1e802 100644 --- a/python-package/lightgbm/compat.py +++ b/python-package/lightgbm/compat.py @@ -201,6 +201,7 @@ def __init__(self, *args, **kwargs): from pyarrow import Array as pa_Array from pyarrow import ChunkedArray as pa_ChunkedArray from pyarrow import Table as pa_Table + from pyarrow import chunked_array as pa_chunked_array from pyarrow.cffi import ffi as arrow_cffi from pyarrow.types import is_floating as arrow_is_floating from pyarrow.types import is_integer as arrow_is_integer @@ -243,6 +244,7 @@ class pa_compute: # type: ignore all = None equal = None + pa_chunked_array = None arrow_is_integer = None arrow_is_floating = None diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 78dd5e4319a5..058d7bd328ad 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -904,6 +904,8 @@ bool Dataset::SetFieldFromArrow(const char* field_name, const ArrowChunkedArray metadata_.SetLabel(ca); } else if (name == std::string("weight") || name == std::string("weights")) { metadata_.SetWeights(ca); + } else if (name == std::string("init_score")) { + metadata_.SetInitScore(ca); } else if (name == std::string("query") || name == std::string("group")) { metadata_.SetQuery(ca); } else { diff --git a/src/io/metadata.cpp b/src/io/metadata.cpp index d94b0ed3f2f7..55440649f55e 100644 --- a/src/io/metadata.cpp +++ b/src/io/metadata.cpp @@ -355,32 +355,44 @@ void Metadata::CheckOrPartition(data_size_t num_all_data, const std::vector +void Metadata::SetInitScoresFromIterator(It first, It last) { std::lock_guard lock(mutex_); - // save to nullptr - if (init_score == nullptr || len == 0) { + // Clear init scores on empty input + if (last - first == 0) { init_score_.clear(); num_init_score_ = 0; return; } - if ((len % num_data_) != 0) { + if (((last - first) % num_data_) != 0) { Log::Fatal("Initial score size doesn't match data size"); } - if (init_score_.empty()) { init_score_.resize(len); } - num_init_score_ = len; + if (init_score_.empty()) { + init_score_.resize(last - first); + } + num_init_score_ = last - first; #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (num_init_score_ >= 1024) for (int64_t i = 0; i < num_init_score_; ++i) { - init_score_[i] = Common::AvoidInf(init_score[i]); + init_score_[i] = Common::AvoidInf(first[i]); } init_score_load_from_file_ = false; + #ifdef USE_CUDA if (cuda_metadata_ != nullptr) { - cuda_metadata_->SetInitScore(init_score_.data(), len); + cuda_metadata_->SetInitScore(init_score_.data(), init_score_.size()); } #endif // USE_CUDA } +void Metadata::SetInitScore(const double* init_score, data_size_t len) { + SetInitScoresFromIterator(init_score, init_score + len); +} + +void Metadata::SetInitScore(const ArrowChunkedArray& array) { + SetInitScoresFromIterator(array.begin(), array.end()); +} + void Metadata::InsertInitScores(const double* init_scores, data_size_t start_index, data_size_t len, data_size_t source_size) { if (num_init_score_ <= 0) { Log::Fatal("Inserting initial score data into dataset with no initial scores"); diff --git a/tests/python_package_test/test_arrow.py b/tests/python_package_test/test_arrow.py index 38b053e94fd5..fd20df25dd87 100644 --- a/tests/python_package_test/test_arrow.py +++ b/tests/python_package_test/test_arrow.py @@ -178,7 +178,7 @@ def test_dataset_construct_weights_none(): ["array_type", "weight_data"], [(pa.array, [3, 0.7, 1.5, 0.5, 0.1]), (pa.chunked_array, [[3], 
[0.7, 1.5, 0.5, 0.1]])], ) -@pytest.mark.parametrize("arrow_type", [pa.float32(), pa.float64()]) +@pytest.mark.parametrize("arrow_type", _FLOAT_TYPES) def test_dataset_construct_weights(array_type, weight_data, arrow_type): data = generate_dummy_arrow_table() weights = array_type(weight_data, type=arrow_type) @@ -210,3 +210,46 @@ def test_dataset_construct_groups(array_type, group_data, arrow_type): expected = np.array([0, 2, 5], dtype=np.int32) np_assert_array_equal(expected, dataset.get_field("group"), strict=True) + + +# ----------------------------------------- INIT SCORES ----------------------------------------- # + + +@pytest.mark.parametrize( + ["array_type", "init_score_data"], + [ + (pa.array, [0, 1, 2, 3, 3]), + (pa.chunked_array, [[0, 1, 2], [3, 3]]), + (pa.chunked_array, [[], [0, 1, 2], [3, 3]]), + (pa.chunked_array, [[0, 1], [], [], [2], [3, 3], []]), + ], +) +@pytest.mark.parametrize("arrow_type", _INTEGER_TYPES + _FLOAT_TYPES) +def test_dataset_construct_init_scores_array( + array_type: Any, init_score_data: Any, arrow_type: Any +): + data = generate_dummy_arrow_table() + init_scores = array_type(init_score_data, type=arrow_type) + dataset = lgb.Dataset(data, init_score=init_scores, params=dummy_dataset_params()) + dataset.construct() + + expected = np.array([0, 1, 2, 3, 3], dtype=np.float64) + np_assert_array_equal(expected, dataset.get_init_score(), strict=True) + + +def test_dataset_construct_init_scores_table(): + data = generate_dummy_arrow_table() + init_scores = pa.Table.from_arrays( + [ + generate_random_arrow_array(5, seed=1), + generate_random_arrow_array(5, seed=2), + generate_random_arrow_array(5, seed=3), + ], + names=["a", "b", "c"], + ) + dataset = lgb.Dataset(data, init_score=init_scores, params=dummy_dataset_params()) + dataset.construct() + + actual = dataset.get_init_score() + expected = init_scores.to_pandas().to_numpy().astype(np.float64) + np_assert_array_equal(expected, actual, strict=True) From d84582b746500237c52701975e006ba8a813d229 Mon Sep 17 00:00:00 2001 From: Oliver Borchert Date: Wed, 6 Dec 2023 16:18:28 +0000 Subject: [PATCH 13/19] Fix null handling for Arrow data (#6227) --- include/LightGBM/arrow.tpp | 2 +- tests/cpp_tests/test_arrow.cpp | 6 ++++-- tests/python_package_test/test_arrow.py | 11 +++++++++++ 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/include/LightGBM/arrow.tpp b/include/LightGBM/arrow.tpp index 67b481c9497e..8d1ce4f4c0c1 100644 --- a/include/LightGBM/arrow.tpp +++ b/include/LightGBM/arrow.tpp @@ -144,7 +144,7 @@ struct ArrayIndexAccessor { // - The structure of validity bitmasks is taken from here: // https://arrow.apache.org/docs/format/Columnar.html#validity-bitmaps // - If the bitmask is NULL, all indices are valid - if (validity == nullptr || !(validity[buffer_idx / 8] & (1 << (buffer_idx % 8)))) { + if (validity == nullptr || (validity[buffer_idx / 8] & (1 << (buffer_idx % 8)))) { // In case the index is valid, we take it from the data buffer auto data = static_cast(array->buffers[1]); return static_cast(data[buffer_idx]); diff --git a/tests/cpp_tests/test_arrow.cpp b/tests/cpp_tests/test_arrow.cpp index 7e3c57c401f4..e975b6ba374b 100644 --- a/tests/cpp_tests/test_arrow.cpp +++ b/tests/cpp_tests/test_arrow.cpp @@ -41,10 +41,12 @@ class ArrowChunkedArrayTest : public testing::Test { // 1) Create validity bitmap char* validity = nullptr; if (!null_indices.empty()) { - validity = static_cast(calloc(values.size() + sizeof(char) - 1, sizeof(char))); + auto num_bytes = (values.size() + 7) / 8; + validity = 
static_cast<char*>(calloc(num_bytes, sizeof(char)));
+      memset(validity, 0xff, num_bytes * sizeof(char));
       for (size_t i = 0; i < values.size(); ++i) {
         if (std::find(null_indices.begin(), null_indices.end(), i) != null_indices.end()) {
-          validity[i / 8] |= (1 << (i % 8));
+          validity[i / 8] &= ~(1 << (i % 8));
         }
       }
     }
diff --git a/tests/python_package_test/test_arrow.py b/tests/python_package_test/test_arrow.py
index fd20df25dd87..5e09465e34b3 100644
--- a/tests/python_package_test/test_arrow.py
+++ b/tests/python_package_test/test_arrow.py
@@ -46,6 +46,16 @@ def generate_simple_arrow_table() -> pa.Table:
     return pa.Table.from_arrays(columns, names=[f"col_{i}" for i in range(len(columns))])


+def generate_nullable_arrow_table() -> pa.Table:
+    columns = [
+        pa.chunked_array([[1, None, 3, 4, 5]], type=pa.float32()),
+        pa.chunked_array([[None, 2, 3, 4, 5]], type=pa.float32()),
+        pa.chunked_array([[1, 2, 3, 4, None]], type=pa.float32()),
+        pa.chunked_array([[None, None, None, None, None]], type=pa.float32()),
+    ]
+    return pa.Table.from_arrays(columns, names=[f"col_{i}" for i in range(len(columns))])
+
+
 def generate_dummy_arrow_table() -> pa.Table:
     col1 = pa.chunked_array([[1, 2, 3], [4, 5]], type=pa.uint8())
     col2 = pa.chunked_array([[0.5, 0.6], [0.1, 0.8, 1.5]], type=pa.float32())
@@ -95,6 +105,7 @@ def dummy_dataset_params() -> Dict[str, Any]:
     [  # Use lambda functions here to minimize memory consumption
         (lambda: generate_simple_arrow_table(), dummy_dataset_params()),
         (lambda: generate_dummy_arrow_table(), dummy_dataset_params()),
+        (lambda: generate_nullable_arrow_table(), dummy_dataset_params()),
         (lambda: generate_random_arrow_table(3, 1000, 42), {}),
         (lambda: generate_random_arrow_table(100, 10000, 43), {}),
     ],

From 4aba4fc1326210a1501f144bd54d77a64d127362 Mon Sep 17 00:00:00 2001
From: James Lamb
Date: Wed, 6 Dec 2023 12:56:27 -0600
Subject: [PATCH 14/19] [R-package] change CRAN maintainer (#6224)

---
 R-package/DESCRIPTION | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION
index 1193c0d463b9..62b479530b4a 100755
--- a/R-package/DESCRIPTION
+++ b/R-package/DESCRIPTION
@@ -4,10 +4,10 @@ Title: Light Gradient Boosting Machine
 Version: ~~VERSION~~
 Date: ~~DATE~~
 Authors@R: c(
-    person("Yu", "Shi", email = "yushi2@microsoft.com", role = c("aut", "cre")),
+    person("Yu", "Shi", email = "yushi2@microsoft.com", role = c("aut")),
     person("Guolin", "Ke", email = "guolin.ke@outlook.com", role = c("aut")),
     person("Damien", "Soukhavong", email = "damien.soukhavong@skema.edu", role = c("aut")),
-    person("James", "Lamb", email="jaylamb20@gmail.com", role = c("aut")),
+    person("James", "Lamb", email="jaylamb20@gmail.com", role = c("aut", "cre")),
     person("Qi", "Meng", role = c("aut")),
     person("Thomas", "Finley", role = c("aut")),
     person("Taifeng", "Wang", role = c("aut")),

From e797985227a012a837c20eddc457de6b7fc7aeaa Mon Sep 17 00:00:00 2001
From: José Morales
Date: Thu, 7 Dec 2023 08:54:18 -0600
Subject: [PATCH 15/19] [python-package] take shallow copy of dataframe in
 predict (fixes #6195) (#6218)

---
 python-package/lightgbm/basic.py        |  5 ++++-
 tests/python_package_test/test_basic.py | 19 ++++++++++++++++---
 2 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py
index 31ae5182ee9e..c4022e7fdd9a 100644
--- a/python-package/lightgbm/basic.py
+++ b/python-package/lightgbm/basic.py
@@ -789,6 +789,10 @@ def _data_from_pandas(
     if len(data.shape) != 2 or data.shape[0] <
1: raise ValueError('Input data must be 2 dimensional and non empty.') + # take shallow copy in case we modify categorical columns + # whole column modifications don't change the original df + data = data.copy(deep=False) + # determine feature names if feature_name == 'auto': feature_name = [str(col) for col in data.columns] @@ -805,7 +809,6 @@ def _data_from_pandas( if list(data[col].cat.categories) != list(category): data[col] = data[col].cat.set_categories(category) if len(cat_cols): # cat_cols is list - data = data.copy(deep=False) # not alter origin DataFrame data[cat_cols] = data[cat_cols].apply(lambda x: x.cat.codes).replace({-1: np.nan}) if categorical_feature == 'auto': # use cat cols from DataFrame categorical_feature = cat_cols_not_ordered diff --git a/tests/python_package_test/test_basic.py b/tests/python_package_test/test_basic.py index 2f6b07e7a77f..b8ef43e41397 100644 --- a/tests/python_package_test/test_basic.py +++ b/tests/python_package_test/test_basic.py @@ -822,21 +822,34 @@ def test_no_copy_when_single_float_dtype_dataframe(dtype, feature_name): @pytest.mark.parametrize('feature_name', [['x1'], [42], 'auto']) -def test_categorical_code_conversion_doesnt_modify_original_data(feature_name): +@pytest.mark.parametrize('categories', ['seen', 'unseen']) +def test_categorical_code_conversion_doesnt_modify_original_data(feature_name, categories): pd = pytest.importorskip('pandas') X = np.random.choice(['a', 'b'], 100).reshape(-1, 1) column_name = 'a' if feature_name == 'auto' else feature_name[0] df = pd.DataFrame(X.copy(), columns=[column_name], dtype='category') + if categories == 'seen': + pandas_categorical = [['a', 'b']] + else: + pandas_categorical = [['a']] data = lgb.basic._data_from_pandas( data=df, feature_name=feature_name, categorical_feature="auto", - pandas_categorical=None + pandas_categorical=pandas_categorical, )[0] # check that the original data wasn't modified np.testing.assert_equal(df[column_name], X[:, 0]) # check that the built data has the codes - np.testing.assert_equal(df[column_name].cat.codes, data[:, 0]) + if categories == 'seen': + # if all categories were seen during training we just take the codes + codes = df[column_name].cat.codes + else: + # if we only saw 'a' during training we just replace its code + # and leave the rest as nan + a_code = df[column_name].cat.categories.get_loc('a') + codes = np.where(df[column_name] == 'a', a_code, np.nan) + np.testing.assert_equal(codes, data[:, 0]) @pytest.mark.parametrize('min_data_in_bin', [2, 10]) From 1548b42bac5d5b7c295ba4d3132e8bda47e34fd1 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Thu, 7 Dec 2023 17:03:16 -0600 Subject: [PATCH 16/19] [R-package] [c++] add tighter multithreading control, avoid global OpenMP side effects (fixes #4705, fixes #5102) (#6226) --- .ci/lint-cpp.sh | 3 +- CMakeLists.txt | 1 + R-package/NAMESPACE | 2 + R-package/R/lgb.Booster.R | 12 +++++ R-package/R/lgb.Dataset.R | 22 ++++++++ R-package/R/lgb.cv.R | 2 + R-package/R/lgb.importance.R | 2 + R-package/R/lgb.interprete.R | 2 + R-package/R/lgb.model.dt.tree.R | 2 + R-package/R/lgb.plot.importance.R | 2 + R-package/R/lgb.plot.interpretation.R | 2 + R-package/R/lgb.restore_handle.R | 4 ++ R-package/R/lgb.train.R | 2 + R-package/R/multithreading.R | 51 +++++++++++++++++++ R-package/R/readRDS.lgb.Booster.R | 2 + R-package/R/saveRDS.lgb.Booster.R | 2 + R-package/man/dim.Rd | 2 + R-package/man/dimnames.lgb.Dataset.Rd | 2 + R-package/man/getLGBMThreads.Rd | 26 ++++++++++ R-package/man/get_field.Rd | 2 + R-package/man/lgb.Dataset.Rd 
| 2 + R-package/man/lgb.Dataset.construct.Rd | 2 + R-package/man/lgb.Dataset.create.valid.Rd | 2 + R-package/man/lgb.Dataset.save.Rd | 2 + R-package/man/lgb.Dataset.set.categorical.Rd | 2 + R-package/man/lgb.Dataset.set.reference.Rd | 2 + R-package/man/lgb.configure_fast_predict.Rd | 2 + R-package/man/lgb.cv.Rd | 2 + R-package/man/lgb.dump.Rd | 2 + R-package/man/lgb.get.eval.result.Rd | 2 + R-package/man/lgb.importance.Rd | 2 + R-package/man/lgb.interprete.Rd | 2 + R-package/man/lgb.load.Rd | 2 + R-package/man/lgb.model.dt.tree.Rd | 2 + R-package/man/lgb.plot.importance.Rd | 2 + R-package/man/lgb.plot.interpretation.Rd | 2 + R-package/man/lgb.restore_handle.Rd | 4 ++ R-package/man/lgb.save.Rd | 2 + R-package/man/lgb.train.Rd | 2 + R-package/man/predict.lgb.Booster.Rd | 2 + R-package/man/readRDS.lgb.Booster.Rd | 2 + R-package/man/saveRDS.lgb.Booster.Rd | 2 + R-package/man/setLGBMThreads.Rd | 32 ++++++++++++ R-package/man/set_field.Rd | 2 + R-package/man/slice.Rd | 2 + R-package/src/Makevars.in | 1 + R-package/src/Makevars.win.in | 1 + R-package/src/lightgbm_R.cpp | 19 +++++++ R-package/src/lightgbm_R.h | 19 +++++++ R-package/tests/testthat/helper.R | 5 ++ .../tests/testthat/test_multithreading.R | 16 ++++++ R-package/vignettes/basic_walkthrough.Rmd | 6 +++ build-cran-package.sh | 1 + include/LightGBM/c_api.h | 14 +++++ include/LightGBM/utils/openmp_wrapper.h | 47 ++++++++++------- src/c_api.cpp | 17 +++++++ src/utils/openmp_wrapper.cpp | 44 ++++++++++++++++ tests/c_api_test/test_.py | 33 ++++++++++++ 58 files changed, 429 insertions(+), 21 deletions(-) create mode 100644 R-package/R/multithreading.R create mode 100644 R-package/man/getLGBMThreads.Rd create mode 100644 R-package/man/setLGBMThreads.Rd create mode 100644 R-package/tests/testthat/test_multithreading.R create mode 100644 src/utils/openmp_wrapper.cpp diff --git a/.ci/lint-cpp.sh b/.ci/lint-cpp.sh index 56489ecf3325..2d91f8e85f00 100755 --- a/.ci/lint-cpp.sh +++ b/.ci/lint-cpp.sh @@ -30,8 +30,7 @@ get_omp_pragmas_without_num_threads() { --include='*.h' \ --include='*.hpp' \ 'pragma omp parallel' \ - | grep -v ' num_threads' \ - | grep -v 'openmp_wrapper.h' + | grep -v ' num_threads' } PROBLEMATIC_LINES=$( get_omp_pragmas_without_num_threads diff --git a/CMakeLists.txt b/CMakeLists.txt index 50b3cbaaf189..aef95871e4cc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -432,6 +432,7 @@ file( src/objective/*.cpp src/network/*.cpp src/treelearner/*.cpp + src/utils/*.cpp if(USE_CUDA) src/treelearner/*.cu src/boosting/cuda/*.cpp diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE index e07af84d8824..ab987d0593eb 100644 --- a/R-package/NAMESPACE +++ b/R-package/NAMESPACE @@ -9,6 +9,7 @@ S3method(print,lgb.Booster) S3method(set_field,lgb.Dataset) S3method(slice,lgb.Dataset) S3method(summary,lgb.Booster) +export(getLGBMthreads) export(get_field) export(lgb.Dataset) export(lgb.Dataset.construct) @@ -35,6 +36,7 @@ export(lgb.train) export(lightgbm) export(readRDS.lgb.Booster) export(saveRDS.lgb.Booster) +export(setLGBMthreads) export(set_field) export(slice) import(methods) diff --git a/R-package/R/lgb.Booster.R b/R-package/R/lgb.Booster.R index 17da9545ae19..4437c6fa552e 100644 --- a/R-package/R/lgb.Booster.R +++ b/R-package/R/lgb.Booster.R @@ -917,6 +917,8 @@ NULL #' the factor levels not being present in the output. 
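
(Returning to the null-handling fix in patch 13: Arrow validity bitmaps mark element i as valid when bit i is set, and the old accessor tested the negation. A small pure-Python model of the corrected check; illustrative only, not LightGBM API.)

    from typing import Optional

    def is_valid(validity: Optional[bytes], idx: int) -> bool:
        # a missing validity buffer means every element is valid
        if validity is None:
            return True
        # bit idx of the bitmap is 1 for valid (non-null) elements
        return bool(validity[idx // 8] & (1 << (idx % 8)))

    # bits 1, 2 and 4 set -> elements 1, 2 and 4 are valid, 0 and 3 are null
    bitmap = bytes([0b00010110])
    print([is_valid(bitmap, i) for i in range(5)])  # [False, True, True, False, True]
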
#' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) @@ -1082,6 +1084,8 @@ predict.lgb.Booster <- function(object, #' \link{predict.lgb.Booster}. #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' library(lightgbm) #' data(mtcars) #' X <- as.matrix(mtcars[, -1L]) @@ -1224,6 +1228,8 @@ summary.lgb.Booster <- function(object, ...) { #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) @@ -1289,6 +1295,8 @@ lgb.load <- function(filename = NULL, model_str = NULL) { #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' library(lightgbm) #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train @@ -1346,6 +1354,8 @@ lgb.save <- function(booster, filename, num_iteration = NULL) { #' @examples #' \donttest{ #' library(lightgbm) +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) @@ -1396,6 +1406,8 @@ lgb.dump <- function(booster, num_iteration = NULL) { #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' # train a regression model #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train diff --git a/R-package/R/lgb.Dataset.R b/R-package/R/lgb.Dataset.R index ddc338d2cae3..ff9b0b4fa38a 100644 --- a/R-package/R/lgb.Dataset.R +++ b/R-package/R/lgb.Dataset.R @@ -780,6 +780,8 @@ Dataset <- R6::R6Class( #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) @@ -837,6 +839,8 @@ lgb.Dataset <- function(data, #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) @@ -913,6 +917,8 @@ lgb.Dataset.create.valid <- function(dataset, #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) @@ -942,6 +948,8 @@ lgb.Dataset.construct <- function(dataset) { #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) @@ -975,6 +983,8 @@ dim.lgb.Dataset <- function(x) { #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) @@ -1045,6 +1055,8 @@ dimnames.lgb.Dataset <- function(x) { #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' 
data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) @@ -1089,6 +1101,8 @@ slice.lgb.Dataset <- function(dataset, idxset) { #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) @@ -1138,6 +1152,8 @@ get_field.lgb.Dataset <- function(dataset, field_name) { #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) @@ -1177,6 +1193,8 @@ set_field.lgb.Dataset <- function(dataset, field_name, data) { #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) @@ -1207,6 +1225,8 @@ lgb.Dataset.set.categorical <- function(dataset, categorical_feature) { #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' # create training Dataset #' data(agaricus.train, package ="lightgbm") #' train <- agaricus.train @@ -1240,6 +1260,8 @@ lgb.Dataset.set.reference <- function(dataset, reference) { #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/R/lgb.cv.R b/R-package/R/lgb.cv.R index 11768c5bfa0b..0545fbf71899 100644 --- a/R-package/R/lgb.cv.R +++ b/R-package/R/lgb.cv.R @@ -51,6 +51,8 @@ CVBooster <- R6::R6Class( #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/R/lgb.importance.R b/R-package/R/lgb.importance.R index 27efb17392df..7c76131f4f53 100644 --- a/R-package/R/lgb.importance.R +++ b/R-package/R/lgb.importance.R @@ -14,6 +14,8 @@ #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/R/lgb.interprete.R b/R-package/R/lgb.interprete.R index 976315262792..8f93d45429f1 100644 --- a/R-package/R/lgb.interprete.R +++ b/R-package/R/lgb.interprete.R @@ -17,6 +17,8 @@ #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' Logit <- function(x) log(x / (1.0 - x)) #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train diff --git a/R-package/R/lgb.model.dt.tree.R b/R-package/R/lgb.model.dt.tree.R index 5d994accfa7f..bf4562e41018 100644 --- a/R-package/R/lgb.model.dt.tree.R +++ b/R-package/R/lgb.model.dt.tree.R @@ -29,6 +29,8 @@ #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/R/lgb.plot.importance.R b/R-package/R/lgb.plot.importance.R index 
fc59ebd0efec..b8a90ca158ae 100644 --- a/R-package/R/lgb.plot.importance.R +++ b/R-package/R/lgb.plot.importance.R @@ -19,6 +19,8 @@ #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/R/lgb.plot.interpretation.R b/R-package/R/lgb.plot.interpretation.R index 8b95371eb3c2..97650f30a7d3 100644 --- a/R-package/R/lgb.plot.interpretation.R +++ b/R-package/R/lgb.plot.interpretation.R @@ -16,6 +16,8 @@ #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' Logit <- function(x) { #' log(x / (1.0 - x)) #' } diff --git a/R-package/R/lgb.restore_handle.R b/R-package/R/lgb.restore_handle.R index 0ed25ef26f3d..8a24cc628ca9 100644 --- a/R-package/R/lgb.restore_handle.R +++ b/R-package/R/lgb.restore_handle.R @@ -16,7 +16,10 @@ #' @return \code{lgb.Booster} (the same `model` object that was passed as input, invisibly). #' @seealso \link{lgb.make_serializable}, \link{lgb.drop_serialized}. #' @examples +#' \donttest{ #' library(lightgbm) +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' data("agaricus.train") #' model <- lightgbm( #' agaricus.train$data @@ -33,6 +36,7 @@ #' model_new$check_null_handle() #' lgb.restore_handle(model_new) #' model_new$check_null_handle() +#' } #' @export lgb.restore_handle <- function(model) { if (!.is_Booster(x = model)) { diff --git a/R-package/R/lgb.train.R b/R-package/R/lgb.train.R index 6979558d22cd..8a299fb6b8ac 100644 --- a/R-package/R/lgb.train.R +++ b/R-package/R/lgb.train.R @@ -19,6 +19,8 @@ #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/R/multithreading.R b/R-package/R/multithreading.R new file mode 100644 index 000000000000..a8d6b51a8968 --- /dev/null +++ b/R-package/R/multithreading.R @@ -0,0 +1,51 @@ +#' @name setLGBMThreads +#' @title Set maximum number of threads used by LightGBM +#' @description LightGBM attempts to speed up many operations by using multi-threading. +#' The number of threads used in those operations can be controlled via the +#' \code{num_threads} parameter passed through \code{params} to functions like +#' \link{lgb.train} and \link{lgb.Dataset}. However, some operations (like materializing +#' a model from a text file) are done via code paths that don't explicitly accept thread-control +#' configuration. +#' +#' Use this function to set the maximum number of threads LightGBM will use for such operations. +#' +#' This function affects all LightGBM operations in the same process. +#' +#' So, for example, if you call \code{setLGBMthreads(4)}, no other multi-threaded LightGBM +#' operation in the same process will use more than 4 threads. +#' +#' Call \code{setLGBMthreads(-1)} to remove this limitation. 
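
(These R functions are thin wrappers over the process-wide C API functions added later in this patch, LGBM_SetMaxThreads() and LGBM_GetMaxThreads(). A minimal ctypes sketch of the same semantics, assuming a shared library built from this branch; the library filename is platform-dependent.)

    import ctypes

    # lib_lightgbm.so / .dylib / .dll depending on platform
    lib = ctypes.cdll.LoadLibrary("lib_lightgbm.so")

    # cap LightGBM at 4 threads, process-wide
    assert lib.LGBM_SetMaxThreads(ctypes.c_int(4)) == 0

    n = ctypes.c_int(0)
    assert lib.LGBM_GetMaxThreads(ctypes.byref(n)) == 0
    print(n.value)  # 4

    # any non-positive value removes the cap (stored as -1)
    assert lib.LGBM_SetMaxThreads(ctypes.c_int(-1)) == 0
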
+#' @param num_threads maximum number of threads to be used by LightGBM in multi-threaded operations +#' @return NULL +#' @seealso \link{getLGBMthreads} +#' @export +setLGBMthreads <- function(num_threads) { + .Call( + LGBM_SetMaxThreads_R, + num_threads + ) + return(invisible(NULL)) +} + +#' @name getLGBMThreads +#' @title Get default number of threads used by LightGBM +#' @description LightGBM attempts to speed up many operations by using multi-threading. +#' The number of threads used in those operations can be controlled via the +#' \code{num_threads} parameter passed through \code{params} to functions like +#' \link{lgb.train} and \link{lgb.Dataset}. However, some operations (like materializing +#' a model from a text file) are done via code paths that don't explicitly accept thread-control +#' configuration. +#' +#' Use this function to see the default number of threads LightGBM will use for such operations. +#' @return number of threads as an integer. \code{-1} means that in situations where parameter \code{num_threads} is +#' not explicitly supplied, LightGBM will choose a number of threads to use automatically. +#' @seealso \link{setLGBMthreads} +#' @export +getLGBMthreads <- function() { + out <- 0L + .Call( + LGBM_GetMaxThreads_R, + out + ) + return(out) +} diff --git a/R-package/R/readRDS.lgb.Booster.R b/R-package/R/readRDS.lgb.Booster.R index a8abac642c24..69e954fc75f1 100644 --- a/R-package/R/readRDS.lgb.Booster.R +++ b/R-package/R/readRDS.lgb.Booster.R @@ -12,6 +12,8 @@ #' @examples #' \donttest{ #' library(lightgbm) +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/R/saveRDS.lgb.Booster.R b/R-package/R/saveRDS.lgb.Booster.R index d75056e69734..d227d75eb90d 100644 --- a/R-package/R/saveRDS.lgb.Booster.R +++ b/R-package/R/saveRDS.lgb.Booster.R @@ -22,6 +22,8 @@ #' @examples #' \donttest{ #' library(lightgbm) +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/dim.Rd b/R-package/man/dim.Rd index 94ca192d8291..69332d0ec397 100644 --- a/R-package/man/dim.Rd +++ b/R-package/man/dim.Rd @@ -21,6 +21,8 @@ be directly used with an \code{lgb.Dataset} object. 
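
(Aside for Python users, not part of this patch: the closest analogue of dim() on a constructed Dataset.)

    import numpy as np
    import lightgbm as lgb

    ds = lgb.Dataset(np.random.rand(100, 5)).construct()
    print(ds.num_data(), ds.num_feature())  # 100 5
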
} \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/dimnames.lgb.Dataset.Rd b/R-package/man/dimnames.lgb.Dataset.Rd index ec01a04f607b..85f2085f1d77 100644 --- a/R-package/man/dimnames.lgb.Dataset.Rd +++ b/R-package/man/dimnames.lgb.Dataset.Rd @@ -28,6 +28,8 @@ Since row names are irrelevant, it is recommended to use \code{colnames} directl } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/getLGBMThreads.Rd b/R-package/man/getLGBMThreads.Rd new file mode 100644 index 000000000000..21af4f4849d4 --- /dev/null +++ b/R-package/man/getLGBMThreads.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/multithreading.R +\name{getLGBMThreads} +\alias{getLGBMThreads} +\alias{getLGBMthreads} +\title{Get default number of threads used by LightGBM} +\usage{ +getLGBMthreads() +} +\value{ +number of threads as an integer. \code{-1} means that in situations where parameter \code{num_threads} is + not explicitly supplied, LightGBM will choose a number of threads to use automatically. +} +\description{ +LightGBM attempts to speed up many operations by using multi-threading. + The number of threads used in those operations can be controlled via the + \code{num_threads} parameter passed through \code{params} to functions like + \link{lgb.train} and \link{lgb.Dataset}. However, some operations (like materializing + a model from a text file) are done via code paths that don't explicitly accept thread-control + configuration. + + Use this function to see the default number of threads LightGBM will use for such operations. 
+} +\seealso{ +\link{setLGBMthreads} +} diff --git a/R-package/man/get_field.Rd b/R-package/man/get_field.Rd index 1b6692fcf807..e2562cc21364 100644 --- a/R-package/man/get_field.Rd +++ b/R-package/man/get_field.Rd @@ -32,6 +32,8 @@ Get one attribute of a \code{lgb.Dataset} } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/lgb.Dataset.Rd b/R-package/man/lgb.Dataset.Rd index 4895600ff922..2605657b060a 100644 --- a/R-package/man/lgb.Dataset.Rd +++ b/R-package/man/lgb.Dataset.Rd @@ -65,6 +65,8 @@ Construct \code{lgb.Dataset} object from dense matrix, sparse matrix } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/lgb.Dataset.construct.Rd b/R-package/man/lgb.Dataset.construct.Rd index 97c9e7887602..e400e0a5f8d5 100644 --- a/R-package/man/lgb.Dataset.construct.Rd +++ b/R-package/man/lgb.Dataset.construct.Rd @@ -17,6 +17,8 @@ Construct Dataset explicitly } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/lgb.Dataset.create.valid.Rd b/R-package/man/lgb.Dataset.create.valid.Rd index ab8ca753c2b9..fc50dff19986 100644 --- a/R-package/man/lgb.Dataset.create.valid.Rd +++ b/R-package/man/lgb.Dataset.create.valid.Rd @@ -48,6 +48,8 @@ Construct validation data according to training data } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/lgb.Dataset.save.Rd b/R-package/man/lgb.Dataset.save.Rd index 5ea38227ba66..b03c2c5e0ac5 100644 --- a/R-package/man/lgb.Dataset.save.Rd +++ b/R-package/man/lgb.Dataset.save.Rd @@ -20,6 +20,8 @@ Please note that \code{init_score} is not saved in binary file. } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/lgb.Dataset.set.categorical.Rd b/R-package/man/lgb.Dataset.set.categorical.Rd index 26eb10770e47..5dfcc9a771e8 100644 --- a/R-package/man/lgb.Dataset.set.categorical.Rd +++ b/R-package/man/lgb.Dataset.set.categorical.Rd @@ -22,6 +22,8 @@ Set the categorical features of an \code{lgb.Dataset} object. 
Use this function } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/lgb.Dataset.set.reference.Rd b/R-package/man/lgb.Dataset.set.reference.Rd index 349b0b22913e..a4efbfac5962 100644 --- a/R-package/man/lgb.Dataset.set.reference.Rd +++ b/R-package/man/lgb.Dataset.set.reference.Rd @@ -19,6 +19,8 @@ If you want to use validation data, you should set reference to training data } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} # create training Dataset data(agaricus.train, package ="lightgbm") train <- agaricus.train diff --git a/R-package/man/lgb.configure_fast_predict.Rd b/R-package/man/lgb.configure_fast_predict.Rd index 39fe6afa6b18..e02600451df5 100644 --- a/R-package/man/lgb.configure_fast_predict.Rd +++ b/R-package/man/lgb.configure_fast_predict.Rd @@ -114,6 +114,8 @@ Calling this function multiple times with different parameters might not overrid } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} library(lightgbm) data(mtcars) X <- as.matrix(mtcars[, -1L]) diff --git a/R-package/man/lgb.cv.Rd b/R-package/man/lgb.cv.Rd index 555cb11c7bb3..7ea2928c6166 100644 --- a/R-package/man/lgb.cv.Rd +++ b/R-package/man/lgb.cv.Rd @@ -152,6 +152,8 @@ Cross validation logic used by LightGBM \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/lgb.dump.Rd b/R-package/man/lgb.dump.Rd index f4e90242fd75..39f0e3018ac7 100644 --- a/R-package/man/lgb.dump.Rd +++ b/R-package/man/lgb.dump.Rd @@ -20,6 +20,8 @@ Dump LightGBM model to json \examples{ \donttest{ library(lightgbm) +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/lgb.get.eval.result.Rd b/R-package/man/lgb.get.eval.result.Rd index 9c2293a0f909..0dc7eb0845c3 100644 --- a/R-package/man/lgb.get.eval.result.Rd +++ b/R-package/man/lgb.get.eval.result.Rd @@ -33,6 +33,8 @@ Given a \code{lgb.Booster}, return evaluation results for a } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} # train a regression model data(agaricus.train, package = "lightgbm") train <- agaricus.train diff --git a/R-package/man/lgb.importance.Rd b/R-package/man/lgb.importance.Rd index 89a3d4e6b5b7..79cb82f5d8ef 100644 --- a/R-package/man/lgb.importance.Rd +++ b/R-package/man/lgb.importance.Rd @@ -25,6 +25,8 @@ Creates a \code{data.table} of feature importances in a model. } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/lgb.interprete.Rd b/R-package/man/lgb.interprete.Rd index c1166b2c1cc9..3acc27955c46 100644 --- a/R-package/man/lgb.interprete.Rd +++ b/R-package/man/lgb.interprete.Rd @@ -30,6 +30,8 @@ Computes feature contribution components of rawscore prediction. 
} \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} Logit <- function(x) log(x / (1.0 - x)) data(agaricus.train, package = "lightgbm") train <- agaricus.train diff --git a/R-package/man/lgb.load.Rd b/R-package/man/lgb.load.Rd index c1a00a20974b..f145db5a245e 100644 --- a/R-package/man/lgb.load.Rd +++ b/R-package/man/lgb.load.Rd @@ -20,6 +20,8 @@ Load LightGBM takes in either a file path or model string. } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/lgb.model.dt.tree.Rd b/R-package/man/lgb.model.dt.tree.Rd index 4d02ede9a001..60ef8cdac133 100644 --- a/R-package/man/lgb.model.dt.tree.Rd +++ b/R-package/man/lgb.model.dt.tree.Rd @@ -40,6 +40,8 @@ Parse a LightGBM model json dump into a \code{data.table} structure. } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/lgb.plot.importance.Rd b/R-package/man/lgb.plot.importance.Rd index 302f46460e3f..bdf354da0385 100644 --- a/R-package/man/lgb.plot.importance.Rd +++ b/R-package/man/lgb.plot.importance.Rd @@ -38,6 +38,8 @@ Features are shown ranked in a decreasing importance order. } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/lgb.plot.interpretation.Rd b/R-package/man/lgb.plot.interpretation.Rd index a914071e896f..6f168e120a4e 100644 --- a/R-package/man/lgb.plot.interpretation.Rd +++ b/R-package/man/lgb.plot.interpretation.Rd @@ -35,6 +35,8 @@ contribution of a feature. Features are shown ranked in a decreasing contributio } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} Logit <- function(x) { log(x / (1.0 - x)) } diff --git a/R-package/man/lgb.restore_handle.Rd b/R-package/man/lgb.restore_handle.Rd index 95cbdc64485d..37922c077642 100644 --- a/R-package/man/lgb.restore_handle.Rd +++ b/R-package/man/lgb.restore_handle.Rd @@ -27,7 +27,10 @@ function. If you wish to make fast single-row predictions using a \code{lgb.Boos call \link{lgb.configure_fast_predict} on the loaded \code{lgb.Booster} object. } \examples{ +\donttest{ library(lightgbm) +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data("agaricus.train") model <- lightgbm( agaricus.train$data @@ -45,6 +48,7 @@ model_new$check_null_handle() lgb.restore_handle(model_new) model_new$check_null_handle() } +} \seealso{ \link{lgb.make_serializable}, \link{lgb.drop_serialized}. } diff --git a/R-package/man/lgb.save.Rd b/R-package/man/lgb.save.Rd index efd110c7d816..62ec0ed462f6 100644 --- a/R-package/man/lgb.save.Rd +++ b/R-package/man/lgb.save.Rd @@ -21,6 +21,8 @@ Save LightGBM model } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} library(lightgbm) data(agaricus.train, package = "lightgbm") train <- agaricus.train diff --git a/R-package/man/lgb.train.Rd b/R-package/man/lgb.train.Rd index 0f2961edc415..557c85b7f9dc 100644 --- a/R-package/man/lgb.train.Rd +++ b/R-package/man/lgb.train.Rd @@ -130,6 +130,8 @@ Low-level R interface to train a LightGBM model. 
Unlike \code{\link{lightgbm}}, \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/predict.lgb.Booster.Rd b/R-package/man/predict.lgb.Booster.Rd index 2df13b9bc374..bcb2f3f980fb 100644 --- a/R-package/man/predict.lgb.Booster.Rd +++ b/R-package/man/predict.lgb.Booster.Rd @@ -121,6 +121,8 @@ If the model object has been configured for fast single-row predictions through } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/readRDS.lgb.Booster.Rd b/R-package/man/readRDS.lgb.Booster.Rd index 6a8e4c80ca91..0a144434cd36 100644 --- a/R-package/man/readRDS.lgb.Booster.Rd +++ b/R-package/man/readRDS.lgb.Booster.Rd @@ -23,6 +23,8 @@ Calls \code{readRDS} in what is expected to be a serialized \code{lgb.Booster} o \examples{ \donttest{ library(lightgbm) +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/saveRDS.lgb.Booster.Rd b/R-package/man/saveRDS.lgb.Booster.Rd index a8664243dce2..b9b34e1fd021 100644 --- a/R-package/man/saveRDS.lgb.Booster.Rd +++ b/R-package/man/saveRDS.lgb.Booster.Rd @@ -46,6 +46,8 @@ Calls \code{saveRDS} on an \code{lgb.Booster} object, making it serializable bef \examples{ \donttest{ library(lightgbm) +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/setLGBMThreads.Rd b/R-package/man/setLGBMThreads.Rd new file mode 100644 index 000000000000..53336fc2548e --- /dev/null +++ b/R-package/man/setLGBMThreads.Rd @@ -0,0 +1,32 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/multithreading.R +\name{setLGBMThreads} +\alias{setLGBMThreads} +\alias{setLGBMthreads} +\title{Set maximum number of threads used by LightGBM} +\usage{ +setLGBMthreads(num_threads) +} +\arguments{ +\item{num_threads}{maximum number of threads to be used by LightGBM in multi-threaded operations} +} +\description{ +LightGBM attempts to speed up many operations by using multi-threading. + The number of threads used in those operations can be controlled via the + \code{num_threads} parameter passed through \code{params} to functions like + \link{lgb.train} and \link{lgb.Dataset}. However, some operations (like materializing + a model from a text file) are done via code paths that don't explicitly accept thread-control + configuration. + + Use this function to set the maximum number of threads LightGBM will use for such operations. + + This function affects all LightGBM operations in the same process. + + So, for example, if you call \code{setLGBMthreads(4)}, no other multi-threaded LightGBM + operation in the same process will use more than 4 threads. + + Call \code{setLGBMthreads(-1)} to remove this limitation. 
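
(The resolution order this implies, per-call num_threads if set, otherwise the OpenMP default, always clipped by the process-wide cap, is implemented in src/utils/openmp_wrapper.cpp further below. A pure-Python model of that logic; illustrative only.)

    def resolve_num_threads(lgbm_default: int, lgbm_max: int, omp_max: int) -> int:
        # a per-call default (num_threads param) beats the OpenMP-global default...
        n = lgbm_default if lgbm_default > 0 else omp_max
        # ...but the process-wide cap from setLGBMthreads() always wins
        if lgbm_max > 0 and n > lgbm_max:
            return lgbm_max
        return n

    assert resolve_num_threads(lgbm_default=-1, lgbm_max=-1, omp_max=8) == 8
    assert resolve_num_threads(lgbm_default=2, lgbm_max=-1, omp_max=8) == 2
    assert resolve_num_threads(lgbm_default=6, lgbm_max=4, omp_max=8) == 4
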
+} +\seealso{ +\link{getLGBMthreads} +} diff --git a/R-package/man/set_field.Rd b/R-package/man/set_field.Rd index f9901e27eefd..2ceebfb87753 100644 --- a/R-package/man/set_field.Rd +++ b/R-package/man/set_field.Rd @@ -34,6 +34,8 @@ Set one attribute of a \code{lgb.Dataset} } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/slice.Rd b/R-package/man/slice.Rd index 1d7bec08de0f..a65809a239d8 100644 --- a/R-package/man/slice.Rd +++ b/R-package/man/slice.Rd @@ -23,6 +23,8 @@ Get a new \code{lgb.Dataset} containing the specified rows of } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/src/Makevars.in b/R-package/src/Makevars.in index ba9ef054bfab..c04263f62c1c 100644 --- a/R-package/src/Makevars.in +++ b/R-package/src/Makevars.in @@ -53,5 +53,6 @@ OBJECTS = \ treelearner/serial_tree_learner.o \ treelearner/tree_learner.o \ treelearner/voting_parallel_tree_learner.o \ + utils/openmp_wrapper.o \ c_api.o \ lightgbm_R.o diff --git a/R-package/src/Makevars.win.in b/R-package/src/Makevars.win.in index 14f5afde002f..86d56fecdf34 100644 --- a/R-package/src/Makevars.win.in +++ b/R-package/src/Makevars.win.in @@ -54,5 +54,6 @@ OBJECTS = \ treelearner/serial_tree_learner.o \ treelearner/tree_learner.o \ treelearner/voting_parallel_tree_learner.o \ + utils/openmp_wrapper.o \ c_api.o \ lightgbm_R.o diff --git a/R-package/src/lightgbm_R.cpp b/R-package/src/lightgbm_R.cpp index 3ae7a98d8537..4799f8540497 100644 --- a/R-package/src/lightgbm_R.cpp +++ b/R-package/src/lightgbm_R.cpp @@ -1212,6 +1212,23 @@ SEXP LGBM_BoosterGetLoadedParam_R(SEXP handle) { R_API_END(); } +SEXP LGBM_GetMaxThreads_R(SEXP out) { + R_API_BEGIN(); + int num_threads; + CHECK_CALL(LGBM_GetMaxThreads(&num_threads)); + INTEGER(out)[0] = num_threads; + return R_NilValue; + R_API_END(); +} + +SEXP LGBM_SetMaxThreads_R(SEXP num_threads) { + R_API_BEGIN(); + int new_num_threads = Rf_asInteger(num_threads); + CHECK_CALL(LGBM_SetMaxThreads(new_num_threads)); + return R_NilValue; + R_API_END(); +} + // .Call() calls static const R_CallMethodDef CallEntries[] = { {"LGBM_HandleIsNull_R" , (DL_FUNC) &LGBM_HandleIsNull_R , 1}, @@ -1268,6 +1285,8 @@ static const R_CallMethodDef CallEntries[] = { {"LGBM_BoosterDumpModel_R" , (DL_FUNC) &LGBM_BoosterDumpModel_R , 3}, {"LGBM_NullBoosterHandleError_R" , (DL_FUNC) &LGBM_NullBoosterHandleError_R , 0}, {"LGBM_DumpParamAliases_R" , (DL_FUNC) &LGBM_DumpParamAliases_R , 0}, + {"LGBM_GetMaxThreads_R" , (DL_FUNC) &LGBM_GetMaxThreads_R , 1}, + {"LGBM_SetMaxThreads_R" , (DL_FUNC) &LGBM_SetMaxThreads_R , 1}, {NULL, NULL, 0} }; diff --git a/R-package/src/lightgbm_R.h b/R-package/src/lightgbm_R.h index 7141a06a207c..4f0407e8f2ec 100644 --- a/R-package/src/lightgbm_R.h +++ b/R-package/src/lightgbm_R.h @@ -850,4 +850,23 @@ LIGHTGBM_C_EXPORT SEXP LGBM_BoosterDumpModel_R( */ LIGHTGBM_C_EXPORT SEXP LGBM_DumpParamAliases_R(); +/*! +* \brief Get current maximum number of threads used by LightGBM routines in this process. +* \param[out] out current maximum number of threads used by LightGBM. -1 means defaulting to omp_get_num_threads(). +* \return R NULL value +*/ +LIGHTGBM_C_EXPORT SEXP LGBM_GetMaxThreads_R( + SEXP out +); + + +/*! 
+* \brief Set maximum number of threads used by LightGBM routines in this process. +* \param num_threads maximum number of threads used by LightGBM. -1 means defaulting to omp_get_num_threads(). +* \return R NULL value +*/ +LIGHTGBM_C_EXPORT SEXP LGBM_SetMaxThreads_R( + SEXP num_threads +); + #endif // LIGHTGBM_R_H_ diff --git a/R-package/tests/testthat/helper.R b/R-package/tests/testthat/helper.R index 9c928c1f71d1..45edf40efbeb 100644 --- a/R-package/tests/testthat/helper.R +++ b/R-package/tests/testthat/helper.R @@ -11,6 +11,11 @@ # the check farm is a shared resource and will typically be running many checks simultaneously. # .LGB_MAX_THREADS <- 2L +setLGBMthreads(.LGB_MAX_THREADS) + +# control data.table parallelism +# ref: https://github.com/Rdatatable/data.table/issues/5658 +data.table::setDTthreads(1L) # by default, how much should results in tests be allowed to differ from hard-coded expected numbers? .LGB_NUMERIC_TOLERANCE <- 1e-6 diff --git a/R-package/tests/testthat/test_multithreading.R b/R-package/tests/testthat/test_multithreading.R new file mode 100644 index 000000000000..e2f3169627a2 --- /dev/null +++ b/R-package/tests/testthat/test_multithreading.R @@ -0,0 +1,16 @@ +test_that("getLGBMthreads() and setLGBMthreads() work as expected", { + # works with integer input + ret <- setLGBMthreads(2L) + expect_null(ret) + expect_equal(getLGBMthreads(), 2L) + + # works with float input + ret <- setLGBMthreads(1.0) + expect_null(ret) + expect_equal(getLGBMthreads(), 1L) + + # setting to any negative number sets max threads to -1 + ret <- setLGBMthreads(-312L) + expect_null(ret) + expect_equal(getLGBMthreads(), -1L) +}) diff --git a/R-package/vignettes/basic_walkthrough.Rmd b/R-package/vignettes/basic_walkthrough.Rmd index d7aaf676f386..82bd6957640c 100644 --- a/R-package/vignettes/basic_walkthrough.Rmd +++ b/R-package/vignettes/basic_walkthrough.Rmd @@ -27,6 +27,12 @@ Welcome to the world of [LightGBM](https://lightgbm.readthedocs.io/en/latest/), library(lightgbm) ``` +```{r, include=FALSE} +# limit number of threads used, to be respectful of CRAN's resources when it checks this vignette +data.table::setDTthreads(1L) +setLGBMthreads(2L) +``` + This vignette will guide you through its basic usage. It will show how to build a simple binary classification model based on a subset of the `bank` dataset (Moro, Cortez, and Rita 2014). You will use the two input features "age" and "balance" to predict whether a client has subscribed a term deposit. ## The dataset diff --git a/build-cran-package.sh b/build-cran-package.sh index 1c8a5dfbdc48..9fa0c5877085 100755 --- a/build-cran-package.sh +++ b/build-cran-package.sh @@ -227,6 +227,7 @@ if ${BUILD_VIGNETTES} ; then rm -f ./lightgbm/src/network/*.o rm -f ./lightgbm/src/objective/*.o rm -f ./lightgbm/src/treelearner/*.o + rm -f ./lightgbm/src/utils/*.o echo "re-tarring ${TARBALL_NAME}" tar \ diff --git a/include/LightGBM/c_api.h b/include/LightGBM/c_api.h index ada2e4109638..397005477a5c 100644 --- a/include/LightGBM/c_api.h +++ b/include/LightGBM/c_api.h @@ -1561,6 +1561,20 @@ LIGHTGBM_C_EXPORT int LGBM_NetworkInitWithFunctions(int num_machines, void* reduce_scatter_ext_fun, void* allgather_ext_fun); +/*! + * \brief Set maximum number of threads used by LightGBM routines in this process. + * \param num_threads maximum number of threads used by LightGBM. -1 means defaulting to omp_get_num_threads(). + * \return 0 when succeed, -1 when failure happens + */ +LIGHTGBM_C_EXPORT int LGBM_SetMaxThreads(int num_threads); + +/*! 
+ * \brief Get current maximum number of threads used by LightGBM routines in this process.
+ * \param[out] out current maximum number of threads used by LightGBM. -1 means defaulting to omp_get_num_threads().
+ * \return 0 when succeed, -1 when failure happens
+ */
+LIGHTGBM_C_EXPORT int LGBM_GetMaxThreads(int* out);
+
 #if !defined(__cplusplus) && (!defined(__STDC__) || (__STDC_VERSION__ < 199901L))
 /*! \brief Inline specifier no-op in C using standards before C99. */
 #define INLINE_FUNCTION
diff --git a/include/LightGBM/utils/openmp_wrapper.h b/include/LightGBM/utils/openmp_wrapper.h
index a337fc353b75..b9a8ea2982fc 100644
--- a/include/LightGBM/utils/openmp_wrapper.h
+++ b/include/LightGBM/utils/openmp_wrapper.h
@@ -5,6 +5,15 @@
 #ifndef LIGHTGBM_OPENMP_WRAPPER_H_
 #define LIGHTGBM_OPENMP_WRAPPER_H_
 
+#include <LightGBM/export.h>
+
+// this can only be changed by LGBM_SetMaxThreads()
+LIGHTGBM_EXTERN_C int LGBM_MAX_NUM_THREADS;
+
+// this is modified by OMP_SET_NUM_THREADS(), for example
+// by passing num_thread through params
+LIGHTGBM_EXTERN_C int LGBM_DEFAULT_NUM_THREADS;
+
 #ifdef _OPENMP
 
 #include <LightGBM/utils/log.h>
@@ -17,22 +26,25 @@
 #include <stdexcept>
 #include <vector>
 
-inline int OMP_NUM_THREADS() {
-  int ret = 1;
-#pragma omp parallel
-#pragma omp master
-  { ret = omp_get_num_threads(); }
-  return ret;
-}
-
-inline void OMP_SET_NUM_THREADS(int num_threads) {
-  static const int default_omp_num_threads = OMP_NUM_THREADS();
-  if (num_threads > 0) {
-    omp_set_num_threads(num_threads);
-  } else {
-    omp_set_num_threads(default_omp_num_threads);
-  }
-}
+/*
+  Get number of threads to use in OpenMP parallel regions.
+
+  By default, this will return the result of omp_get_max_threads(),
+  which is OpenMP-implementation dependent but generally can be controlled
+  by environment variable OMP_NUM_THREADS.
+
+  ref:
+  - https://www.openmp.org/spec-html/5.0/openmpsu112.html
+  - https://gcc.gnu.org/onlinedocs/libgomp/omp_005fget_005fmax_005fthreads.html
+*/
+LIGHTGBM_EXTERN_C int OMP_NUM_THREADS();
+
+/*
+  Update the default number of threads that'll be used in OpenMP parallel
+  regions for LightGBM routines where the number of threads isn't directly
+  supplied.
+*/
+LIGHTGBM_EXTERN_C void OMP_SET_NUM_THREADS(int num_threads);
 
 class ThreadExceptionHelper {
  public:
@@ -102,10 +114,7 @@ class ThreadExceptionHelper {
 /** Fall here if no OPENMP support, so just simulate a single thread running.
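+    (in this build, OMP_NUM_THREADS() always reports a single thread and
+    OMP_SET_NUM_THREADS() is a no-op, so the thread caps declared above
+    compile down to nothing.)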
    All #pragma omp should be ignored by the compiler **/
-  inline void omp_set_num_threads(int) __GOMP_NOTHROW {}  // NOLINT (no cast done here)
   inline void OMP_SET_NUM_THREADS(int) __GOMP_NOTHROW {}
-  inline int omp_get_num_threads() __GOMP_NOTHROW {return 1;}
-  inline int omp_get_max_threads() __GOMP_NOTHROW {return 1;}
   inline int omp_get_thread_num() __GOMP_NOTHROW {return 0;}
   inline int OMP_NUM_THREADS() __GOMP_NOTHROW { return 1; }
 #ifdef __cplusplus
diff --git a/src/c_api.cpp b/src/c_api.cpp
index baf934db42b1..dbe5425bd0aa 100644
--- a/src/c_api.cpp
+++ b/src/c_api.cpp
@@ -2699,6 +2699,23 @@ int LGBM_NetworkInitWithFunctions(int num_machines, int rank,
   API_END();
 }
 
+int LGBM_SetMaxThreads(int num_threads) {
+  API_BEGIN();
+  if (num_threads <= 0) {
+    LGBM_MAX_NUM_THREADS = -1;
+  } else {
+    LGBM_MAX_NUM_THREADS = num_threads;
+  }
+  API_END();
+}
+
+int LGBM_GetMaxThreads(int* out) {
+  API_BEGIN();
+  *out = LGBM_MAX_NUM_THREADS;
+  API_END();
+}
+
+
 // ---- start of some help functions

diff --git a/src/utils/openmp_wrapper.cpp b/src/utils/openmp_wrapper.cpp
new file mode 100644
index 000000000000..fb6e661eb67c
--- /dev/null
+++ b/src/utils/openmp_wrapper.cpp
@@ -0,0 +1,44 @@
+/*!
+ * Copyright (c) 2023 Microsoft Corporation. All rights reserved.
+ * Licensed under the MIT License. See LICENSE file in the project root for license information.
+ */
+#include <LightGBM/utils/openmp_wrapper.h>
+
+int LGBM_MAX_NUM_THREADS = -1;
+
+int LGBM_DEFAULT_NUM_THREADS = -1;
+
+#ifdef _OPENMP
+
+#include <omp.h>
+
+int OMP_NUM_THREADS() {
+  int default_num_threads = 1;
+
+  if (LGBM_DEFAULT_NUM_THREADS > 0) {
+    // if LightGBM-specific default has been set, ignore OpenMP-global config
+    default_num_threads = LGBM_DEFAULT_NUM_THREADS;
+  } else {
+    // otherwise, default to OpenMP-global config
+    #pragma omp single
+    { default_num_threads = omp_get_max_threads(); }
+  }
+
+  // ensure that if LGBM_SetMaxThreads() was ever called, LightGBM doesn't
+  // use more than that many threads
+  if (LGBM_MAX_NUM_THREADS > 0 && default_num_threads > LGBM_MAX_NUM_THREADS) {
+    return LGBM_MAX_NUM_THREADS;
+  }
+
+  return default_num_threads;
+}
+
+void OMP_SET_NUM_THREADS(int num_threads) {
+  if (num_threads <= 0) {
+    LGBM_DEFAULT_NUM_THREADS = -1;
+  } else {
+    LGBM_DEFAULT_NUM_THREADS = num_threads;
+  }
+}
+
+#endif  // _OPENMP
diff --git a/tests/c_api_test/test_.py b/tests/c_api_test/test_.py
index 4bb76e4aba19..6cfec1c445fc 100644
--- a/tests/c_api_test/test_.py
+++ b/tests/c_api_test/test_.py
@@ -247,3 +247,36 @@ def test_booster():
                           c_str(''),
                           c_str('preb.txt'))
     LIB.LGBM_BoosterFree(booster2)
+
+
+def test_max_thread_control():
+    # at initialization, should be -1
+    num_threads = ctypes.c_int(0)
+    ret = LIB.LGBM_GetMaxThreads(
+        ctypes.byref(num_threads)
+    )
+    assert ret == 0
+    assert num_threads.value == -1
+
+    # updating that value through the C API should work
+    ret = LIB.LGBM_SetMaxThreads(
+        ctypes.c_int(6)
+    )
+    assert ret == 0
+
+    ret = LIB.LGBM_GetMaxThreads(
+        ctypes.byref(num_threads)
+    )
+    assert ret == 0
+    assert num_threads.value == 6
+
+    # resetting to any negative number should set it to -1
+    ret = LIB.LGBM_SetMaxThreads(
+        ctypes.c_int(-123)
+    )
+    assert ret == 0
+    ret = LIB.LGBM_GetMaxThreads(
+        ctypes.byref(num_threads)
+    )
+    assert ret == 0
+    assert num_threads.value == -1

From 522f0f07b0eba0e3190c3e5c8e149a205bd20bf3 Mon Sep 17 00:00:00 2001
From: Oliver Borchert
Date: Sun, 10 Dec 2023 18:29:04 +0100
Subject: [PATCH 17/19] [python-package] Add tests for passing Arrow arrays
 with empty chunks (#6210)

---
 include/LightGBM/arrow.h                |  2 ++
 tests/python_package_test/test_arrow.py | 38 ++++++++++++++++---------
 2 files changed, 27 insertions(+), 13 deletions(-)

diff --git a/include/LightGBM/arrow.h b/include/LightGBM/arrow.h
index 3d1c74713bd3..75511e17e72a 100644
--- a/include/LightGBM/arrow.h
+++ b/include/LightGBM/arrow.h
@@ -117,6 +117,7 @@ class ArrowChunkedArray {
                     const struct ArrowSchema* schema) {
     chunks_.reserve(n_chunks);
     for (auto k = 0; k < n_chunks; ++k) {
+      if (chunks[k].length == 0) continue;
       chunks_.push_back(&chunks[k]);
     }
     schema_ = schema;
@@ -220,6 +221,7 @@ class ArrowTable {
       std::vector<const ArrowArray*> children_chunks;
       children_chunks.reserve(n_chunks);
       for (int64_t k = 0; k < n_chunks; ++k) {
+        if (chunks[k].length == 0) continue;
        children_chunks.push_back(chunks[k].children[j]);
       }
       columns_.emplace_back(children_chunks, schema->children[j]);
diff --git a/tests/python_package_test/test_arrow.py b/tests/python_package_test/test_arrow.py
index 5e09465e34b3..7542368dcd63 100644
--- a/tests/python_package_test/test_arrow.py
+++ b/tests/python_package_test/test_arrow.py
@@ -30,18 +30,19 @@
 ]
 
 
-def generate_simple_arrow_table() -> pa.Table:
+def generate_simple_arrow_table(empty_chunks: bool = False) -> pa.Table:
+    c: list[list[int]] = [[]] if empty_chunks else []
     columns = [
-        pa.chunked_array([[1, 2, 3, 4, 5]], type=pa.uint8()),
-        pa.chunked_array([[1, 2, 3, 4, 5]], type=pa.int8()),
-        pa.chunked_array([[1, 2, 3, 4, 5]], type=pa.uint16()),
-        pa.chunked_array([[1, 2, 3, 4, 5]], type=pa.int16()),
-        pa.chunked_array([[1, 2, 3, 4, 5]], type=pa.uint32()),
-        pa.chunked_array([[1, 2, 3, 4, 5]], type=pa.int32()),
-        pa.chunked_array([[1, 2, 3, 4, 5]], type=pa.uint64()),
-        pa.chunked_array([[1, 2, 3, 4, 5]], type=pa.int64()),
-        pa.chunked_array([[1, 2, 3, 4, 5]], type=pa.float32()),
-        pa.chunked_array([[1, 2, 3, 4, 5]], type=pa.float64()),
+        pa.chunked_array(c + [[1, 2, 3]] + c + [[4, 5]] + c, type=pa.uint8()),
+        pa.chunked_array(c + [[1, 2, 3]] + c + [[4, 5]] + c, type=pa.int8()),
+        pa.chunked_array(c + [[1, 2, 3]] + c + [[4, 5]] + c, type=pa.uint16()),
+        pa.chunked_array(c + [[1, 2, 3]] + c + [[4, 5]] + c, type=pa.int16()),
+        pa.chunked_array(c + [[1, 2, 3]] + c + [[4, 5]] + c, type=pa.uint32()),
+        pa.chunked_array(c + [[1, 2, 3]] + c + [[4, 5]] + c, type=pa.int32()),
+        pa.chunked_array(c + [[1, 2, 3]] + c + [[4, 5]] + c, type=pa.uint64()),
+        pa.chunked_array(c + [[1, 2, 3]] + c + [[4, 5]] + c, type=pa.int64()),
+        pa.chunked_array(c + [[1, 2, 3]] + c + [[4, 5]] + c, type=pa.float32()),
+        pa.chunked_array(c + [[1, 2, 3]] + c + [[4, 5]] + c, type=pa.float64()),
     ]
     return pa.Table.from_arrays(columns, names=[f"col_{i}" for i in range(len(columns))])
 
@@ -104,6 +105,7 @@ def dummy_dataset_params() -> Dict[str, Any]:
     ("arrow_table_fn", "dataset_params"),
     [  # Use lambda functions here to minimize memory consumption
         (lambda: generate_simple_arrow_table(), dummy_dataset_params()),
+        (lambda: generate_simple_arrow_table(empty_chunks=True), dummy_dataset_params()),
         (lambda: generate_dummy_arrow_table(), dummy_dataset_params()),
         (lambda: generate_nullable_arrow_table(), dummy_dataset_params()),
         (lambda: generate_random_arrow_table(3, 1000, 42), {}),
@@ -160,7 +162,12 @@ def test_dataset_construct_fields_fuzzy():
 
 @pytest.mark.parametrize(
     ["array_type", "label_data"],
-    [(pa.array, [0, 1, 0, 0, 1]), (pa.chunked_array, [[0], [1, 0, 0, 1]])],
+    [
+        (pa.array, [0, 1, 0, 0, 1]),
+        (pa.chunked_array, [[0], [1, 0, 0, 1]]),
+        (pa.chunked_array, [[], [0], [1, 0, 0, 1]]),
+        (pa.chunked_array, [[0], [], [1, 0], [], [], [0, 1], []]),
+    ],
 )
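+# The two cases with [] entries place empty chunks at the start, middle, and
+# end of a chunked array; per the arrow.h guards above, zero-length chunks are
+# skipped at construction time, so the labels must round-trip unchanged.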
@pytest.mark.parametrize("arrow_type", _INTEGER_TYPES + _FLOAT_TYPES) def test_dataset_construct_labels(array_type, label_data, arrow_type): @@ -187,7 +194,12 @@ def test_dataset_construct_weights_none(): @pytest.mark.parametrize( ["array_type", "weight_data"], - [(pa.array, [3, 0.7, 1.5, 0.5, 0.1]), (pa.chunked_array, [[3], [0.7, 1.5, 0.5, 0.1]])], + [ + (pa.array, [3, 0.7, 1.5, 0.5, 0.1]), + (pa.chunked_array, [[3], [0.7, 1.5, 0.5, 0.1]]), + (pa.chunked_array, [[], [3], [0.7, 1.5, 0.5, 0.1]]), + (pa.chunked_array, [[3], [0.7], [], [], [1.5, 0.5, 0.1], []]), + ], ) @pytest.mark.parametrize("arrow_type", _FLOAT_TYPES) def test_dataset_construct_weights(array_type, weight_data, arrow_type): From 6fc80528f15b92921ecffaaa14b6bddaa0de3404 Mon Sep 17 00:00:00 2001 From: June Liu <103498042+Zhaojun-Liu@users.noreply.github.com> Date: Wed, 13 Dec 2023 12:06:28 +0800 Subject: [PATCH 18/19] fix errors from MSVC '/permissive-' mode (fixes #6230) (#6232) --- include/LightGBM/arrow.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/LightGBM/arrow.h b/include/LightGBM/arrow.h index 75511e17e72a..767da12a9809 100644 --- a/include/LightGBM/arrow.h +++ b/include/LightGBM/arrow.h @@ -16,6 +16,7 @@ #include #include #include +#include /* -------------------------------------- C DATA INTERFACE ------------------------------------- */ // The C data interface is taken from From 2dfb9a40478b965db8325baa21a63d9281f96b7c Mon Sep 17 00:00:00 2001 From: Oliver Borchert Date: Thu, 14 Dec 2023 04:35:46 +0100 Subject: [PATCH 19/19] [python-package] Allow to pass Arrow table for prediction (#6168) --- include/LightGBM/c_api.h | 34 ++++++ python-package/lightgbm/basic.py | 56 +++++++++- src/c_api.cpp | 51 +++++++++ tests/python_package_test/test_arrow.py | 133 +++++++++++++++++++++--- 4 files changed, 259 insertions(+), 15 deletions(-) diff --git a/include/LightGBM/c_api.h b/include/LightGBM/c_api.h index 397005477a5c..b43f096c31ee 100644 --- a/include/LightGBM/c_api.h +++ b/include/LightGBM/c_api.h @@ -1417,6 +1417,40 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForMats(BoosterHandle handle, int64_t* out_len, double* out_result); +/*! + * \brief Make prediction for a new dataset. + * \note + * You should pre-allocate memory for ``out_result``: + * - for normal and raw score, its length is equal to ``num_class * num_data``; + * - for leaf index, its length is equal to ``num_class * num_data * num_iteration``; + * - for feature contributions, its length is equal to ``num_class * num_data * (num_feature + 1)``. + * \param handle Handle of booster + * \param n_chunks The number of Arrow arrays passed to this function + * \param chunks Pointer to the list of Arrow arrays + * \param schema Pointer to the schema of all Arrow arrays + * \param predict_type What should be predicted + * - ``C_API_PREDICT_NORMAL``: normal prediction, with transform (if needed); + * - ``C_API_PREDICT_RAW_SCORE``: raw score; + * - ``C_API_PREDICT_LEAF_INDEX``: leaf index; + * - ``C_API_PREDICT_CONTRIB``: feature contributions (SHAP values) + * \param start_iteration Start index of the iteration to predict + * \param num_iteration Number of iteration for prediction, <= 0 means no limit + * \param parameter Other parameters for prediction, e.g. 
early stopping for prediction + * \param[out] out_len Length of output result + * \param[out] out_result Pointer to array with predictions + * \return 0 when succeed, -1 when failure happens + */ +LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForArrow(BoosterHandle handle, + int64_t n_chunks, + const ArrowArray* chunks, + const ArrowSchema* schema, + int predict_type, + int start_iteration, + int num_iteration, + const char* parameter, + int64_t* out_len, + double* out_result); + /*! * \brief Save model into file. * \param handle Handle of booster diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index c4022e7fdd9a..560a9a438872 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -115,7 +115,8 @@ np.ndarray, pd_DataFrame, dt_DataTable, - scipy.sparse.spmatrix + scipy.sparse.spmatrix, + pa_Table, ] _LGBM_WeightType = Union[ List[float], @@ -1069,7 +1070,7 @@ def predict( Parameters ---------- - data : str, pathlib.Path, numpy array, pandas DataFrame, H2O DataTable's Frame or scipy.sparse + data : str, pathlib.Path, numpy array, pandas DataFrame, pyarrow Table, H2O DataTable's Frame or scipy.sparse Data source for prediction. If str or pathlib.Path, it represents the path to a text file (CSV, TSV, or LibSVM). start_iteration : int, optional (default=0) @@ -1161,6 +1162,13 @@ def predict( num_iteration=num_iteration, predict_type=predict_type ) + elif _is_pyarrow_table(data): + preds, nrow = self.__pred_for_pyarrow_table( + table=data, + start_iteration=start_iteration, + num_iteration=num_iteration, + predict_type=predict_type + ) elif isinstance(data, list): try: data = np.array(data) @@ -1614,6 +1622,48 @@ def __pred_for_csc( if n_preds != out_num_preds.value: raise ValueError("Wrong length for predict results") return preds, nrow + + def __pred_for_pyarrow_table( + self, + table: pa_Table, + start_iteration: int, + num_iteration: int, + predict_type: int + ) -> Tuple[np.ndarray, int]: + """Predict for a PyArrow table.""" + if not PYARROW_INSTALLED: + raise LightGBMError("Cannot predict from Arrow without `pyarrow` installed.") + + # Check that the input is valid: we only handle numbers (for now) + if not all(arrow_is_integer(t) or arrow_is_floating(t) for t in table.schema.types): + raise ValueError("Arrow table may only have integer or floating point datatypes") + + # Prepare prediction output array + n_preds = self.__get_num_preds( + start_iteration=start_iteration, + num_iteration=num_iteration, + nrow=table.num_rows, + predict_type=predict_type + ) + preds = np.empty(n_preds, dtype=np.float64) + out_num_preds = ctypes.c_int64(0) + + # Export Arrow table to C and run prediction + c_array = _export_arrow_to_c(table) + _safe_call(_LIB.LGBM_BoosterPredictForArrow( + self._handle, + ctypes.c_int64(c_array.n_chunks), + ctypes.c_void_p(c_array.chunks_ptr), + ctypes.c_void_p(c_array.schema_ptr), + ctypes.c_int(predict_type), + ctypes.c_int(start_iteration), + ctypes.c_int(num_iteration), + _c_str(self.pred_parameter), + ctypes.byref(out_num_preds), + preds.ctypes.data_as(ctypes.POINTER(ctypes.c_double)))) + if n_preds != out_num_preds.value: + raise ValueError("Wrong length for predict results") + return preds, table.num_rows def current_iteration(self) -> int: """Get the index of the current iteration. 
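For illustration, a minimal sketch of what `__pred_for_pyarrow_table` enables for end users; the column names and data here are hypothetical, and it assumes pyarrow (plus pandas for the comparison) is installed alongside a lightgbm build that includes this patch:

```python
import numpy as np
import pyarrow as pa
import lightgbm as lgb

rng = np.random.default_rng(42)

# Hypothetical two-column feature table; only numeric Arrow types are allowed.
table = pa.Table.from_arrays(
    [pa.array(rng.standard_normal(200)), pa.array(rng.standard_normal(200))],
    names=["f0", "f1"],
)
booster = lgb.train(
    {"objective": "regression", "verbose": -1},
    lgb.Dataset(table, label=rng.standard_normal(200)),
    num_boost_round=3,
)

# Predict directly on the Arrow table; no pandas round trip is needed, and
# the result matches booster.predict(table.to_pandas()).
preds = booster.predict(table)

# Non-numeric columns are rejected before anything is exported to C:
bad = table.append_column("s", pa.array(["x"] * 200))
# booster.predict(bad)  # ValueError: Arrow table may only have integer or floating point datatypes
```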
@@ -4350,7 +4400,7 @@ def predict(
 
         Parameters
         ----------
-        data : str, pathlib.Path, numpy array, pandas DataFrame, H2O DataTable's Frame or scipy.sparse
+        data : str, pathlib.Path, numpy array, pandas DataFrame, pyarrow Table, H2O DataTable's Frame or scipy.sparse
             Data source for prediction.
             If str or pathlib.Path, it represents the path to a text file (CSV, TSV, or LibSVM).
         start_iteration : int, optional (default=0)
diff --git a/src/c_api.cpp b/src/c_api.cpp
index dbe5425bd0aa..67b18003588a 100644
--- a/src/c_api.cpp
+++ b/src/c_api.cpp
@@ -2568,6 +2568,57 @@ int LGBM_BoosterPredictForMats(BoosterHandle handle,
   API_END();
 }
 
+int LGBM_BoosterPredictForArrow(BoosterHandle handle,
+                                int64_t n_chunks,
+                                const ArrowArray* chunks,
+                                const ArrowSchema* schema,
+                                int predict_type,
+                                int start_iteration,
+                                int num_iteration,
+                                const char* parameter,
+                                int64_t* out_len,
+                                double* out_result) {
+  API_BEGIN();
+
+  // Apply the configuration
+  auto param = Config::Str2Map(parameter);
+  Config config;
+  config.Set(param);
+  OMP_SET_NUM_THREADS(config.num_threads);
+
+  // Set up chunked array and iterators for all columns
+  ArrowTable table(n_chunks, chunks, schema);
+  std::vector<ArrowChunkedArray::Iterator<double>> its;
+  its.reserve(table.get_num_columns());
+  for (int64_t j = 0; j < table.get_num_columns(); ++j) {
+    its.emplace_back(table.get_column(j).begin<double>());
+  }
+
+  // Build row function
+  auto num_columns = table.get_num_columns();
+  auto row_fn = [num_columns, &its] (int row_idx) {
+    std::vector<std::pair<int, double>> result;
+    result.reserve(num_columns);
+    for (int64_t j = 0; j < num_columns; ++j) {
+      result.emplace_back(static_cast<int>(j), its[j][row_idx]);
+    }
+    return result;
+  };
+
+  // Run prediction
+  Booster* ref_booster = reinterpret_cast<Booster*>(handle);
+  ref_booster->Predict(start_iteration,
+                       num_iteration,
+                       predict_type,
+                       static_cast<int32_t>(table.get_num_rows()),
+                       static_cast<int32_t>(table.get_num_columns()),
+                       row_fn,
+                       config,
+                       out_result,
+                       out_len);
+  API_END();
+}
+
 int LGBM_BoosterSaveModel(BoosterHandle handle,
                           int start_iteration,
                           int num_iteration,
diff --git a/tests/python_package_test/test_arrow.py b/tests/python_package_test/test_arrow.py
index 7542368dcd63..593c03d8c7ef 100644
--- a/tests/python_package_test/test_arrow.py
+++ b/tests/python_package_test/test_arrow.py
@@ -1,6 +1,6 @@
 # coding: utf-8
 import filecmp
-from typing import Any, Dict
+from typing import Any, Dict, Optional
 
 import numpy as np
 import pyarrow as pa
@@ -63,19 +63,40 @@ def generate_dummy_arrow_table() -> pa.Table:
     return pa.Table.from_arrays([col1, col2], names=["a", "b"])
 
 
-def generate_random_arrow_table(num_columns: int, num_datapoints: int, seed: int) -> pa.Table:
-    columns = [generate_random_arrow_array(num_datapoints, seed + i) for i in range(num_columns)]
+def generate_random_arrow_table(
+    num_columns: int,
+    num_datapoints: int,
+    seed: int,
+    generate_nulls: bool = True,
+    values: Optional[np.ndarray] = None,
+) -> pa.Table:
+    columns = [
+        generate_random_arrow_array(
+            num_datapoints, seed + i, generate_nulls=generate_nulls, values=values
+        )
+        for i in range(num_columns)
+    ]
     names = [f"col_{i}" for i in range(num_columns)]
     return pa.Table.from_arrays(columns, names=names)
 
 
-def generate_random_arrow_array(num_datapoints: int, seed: int) -> pa.ChunkedArray:
+def generate_random_arrow_array(
+    num_datapoints: int,
+    seed: int,
+    generate_nulls: bool = True,
+    values: Optional[np.ndarray] = None,
+) -> pa.ChunkedArray:
     generator = np.random.default_rng(seed)
-    data = generator.standard_normal(num_datapoints)
+    data = (
+        generator.standard_normal(num_datapoints)
if values is None + else generator.choice(values, size=num_datapoints, replace=True) + ) # Set random nulls - indices = generator.choice(len(data), size=num_datapoints // 10) - data[indices] = None + if generate_nulls: + indices = generator.choice(len(data), size=num_datapoints // 10) + data[indices] = None # Split data into <=2 random chunks split_points = np.sort(generator.choice(np.arange(1, num_datapoints), 2, replace=False)) @@ -131,8 +152,8 @@ def test_dataset_construct_fuzzy(tmp_path, arrow_table_fn, dataset_params): def test_dataset_construct_fields_fuzzy(): arrow_table = generate_random_arrow_table(3, 1000, 42) - arrow_labels = generate_random_arrow_array(1000, 42) - arrow_weights = generate_random_arrow_array(1000, 42) + arrow_labels = generate_random_arrow_array(1000, 42, generate_nulls=False) + arrow_weights = generate_random_arrow_array(1000, 42, generate_nulls=False) arrow_groups = pa.chunked_array([[300, 400, 50], [250]], type=pa.int32()) arrow_dataset = lgb.Dataset( @@ -264,9 +285,9 @@ def test_dataset_construct_init_scores_table(): data = generate_dummy_arrow_table() init_scores = pa.Table.from_arrays( [ - generate_random_arrow_array(5, seed=1), - generate_random_arrow_array(5, seed=2), - generate_random_arrow_array(5, seed=3), + generate_random_arrow_array(5, seed=1, generate_nulls=False), + generate_random_arrow_array(5, seed=2, generate_nulls=False), + generate_random_arrow_array(5, seed=3, generate_nulls=False), ], names=["a", "b", "c"], ) @@ -276,3 +297,91 @@ def test_dataset_construct_init_scores_table(): actual = dataset.get_init_score() expected = init_scores.to_pandas().to_numpy().astype(np.float64) np_assert_array_equal(expected, actual, strict=True) + + +# ------------------------------------------ PREDICTION ----------------------------------------- # + + +def assert_equal_predict_arrow_pandas(booster: lgb.Booster, data: pa.Table): + p_arrow = booster.predict(data) + p_pandas = booster.predict(data.to_pandas()) + np_assert_array_equal(p_arrow, p_pandas, strict=True) + + p_raw_arrow = booster.predict(data, raw_score=True) + p_raw_pandas = booster.predict(data.to_pandas(), raw_score=True) + np_assert_array_equal(p_raw_arrow, p_raw_pandas, strict=True) + + p_leaf_arrow = booster.predict(data, pred_leaf=True) + p_leaf_pandas = booster.predict(data.to_pandas(), pred_leaf=True) + np_assert_array_equal(p_leaf_arrow, p_leaf_pandas, strict=True) + + p_pred_contrib_arrow = booster.predict(data, pred_contrib=True) + p_pred_contrib_pandas = booster.predict(data.to_pandas(), pred_contrib=True) + np_assert_array_equal(p_pred_contrib_arrow, p_pred_contrib_pandas, strict=True) + + p_first_iter_arrow = booster.predict(data, start_iteration=0, num_iteration=1, raw_score=True) + p_first_iter_pandas = booster.predict( + data.to_pandas(), start_iteration=0, num_iteration=1, raw_score=True + ) + np_assert_array_equal(p_first_iter_arrow, p_first_iter_pandas, strict=True) + + +def test_predict_regression(): + data = generate_random_arrow_table(10, 10000, 42) + dataset = lgb.Dataset( + data, + label=generate_random_arrow_array(10000, 43, generate_nulls=False), + params=dummy_dataset_params(), + ) + booster = lgb.train( + {"objective": "regression", "num_leaves": 7}, + dataset, + num_boost_round=5, + ) + assert_equal_predict_arrow_pandas(booster, data) + + +def test_predict_binary_classification(): + data = generate_random_arrow_table(10, 10000, 42) + dataset = lgb.Dataset( + data, + label=generate_random_arrow_array(10000, 43, generate_nulls=False, values=np.arange(2)), + 
params=dummy_dataset_params(), + ) + booster = lgb.train( + {"objective": "binary", "num_leaves": 7}, + dataset, + num_boost_round=5, + ) + assert_equal_predict_arrow_pandas(booster, data) + + +def test_predict_multiclass_classification(): + data = generate_random_arrow_table(10, 10000, 42) + dataset = lgb.Dataset( + data, + label=generate_random_arrow_array(10000, 43, generate_nulls=False, values=np.arange(5)), + params=dummy_dataset_params(), + ) + booster = lgb.train( + {"objective": "multiclass", "num_leaves": 7, "num_class": 5}, + dataset, + num_boost_round=5, + ) + assert_equal_predict_arrow_pandas(booster, data) + + +def test_predict_ranking(): + data = generate_random_arrow_table(10, 10000, 42) + dataset = lgb.Dataset( + data, + label=generate_random_arrow_array(10000, 43, generate_nulls=False, values=np.arange(4)), + group=np.array([1000, 2000, 3000, 4000]), + params=dummy_dataset_params(), + ) + booster = lgb.train( + {"objective": "lambdarank", "num_leaves": 7}, + dataset, + num_boost_round=5, + ) + assert_equal_predict_arrow_pandas(booster, data)
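Taken together, the series makes Arrow a first-class input format for the Python package. To close, here is a minimal end-to-end sketch combining the pieces above; the data is hypothetical, and it assumes pyarrow and pandas are installed and that lightgbm was built with these patches:

```python
import numpy as np
import pyarrow as pa
import lightgbm as lgb

rng = np.random.default_rng(0)
n = 1000

# Features as an Arrow table; the label as a chunked array with a leading
# empty chunk (zero-length chunks are skipped during construction); weights
# as float32 Arrow data.
features = pa.Table.from_arrays(
    [pa.array(rng.standard_normal(n)) for _ in range(3)],
    names=["f0", "f1", "f2"],
)
label = pa.chunked_array([[], rng.integers(0, 2, n).tolist()], type=pa.int32())
weight = pa.chunked_array([rng.random(n).astype(np.float32)])

dataset = lgb.Dataset(features, label=label, weight=weight)
booster = lgb.train(
    {"objective": "binary", "num_leaves": 7, "verbose": -1},
    dataset,
    num_boost_round=5,
)

# Arrow-native prediction should agree exactly with the pandas path.
np.testing.assert_array_equal(
    booster.predict(features), booster.predict(features.to_pandas())
)
```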