From f5b6bd60d9d752c8e5a75b11ab771d0422214bb4 Mon Sep 17 00:00:00 2001 From: Oliver Borchert Date: Mon, 4 Dec 2023 19:26:55 +0000 Subject: [PATCH 1/8] [python-package] Allow to pass Arrow table and array as init scores (#6167) --- include/LightGBM/c_api.h | 5 +-- include/LightGBM/dataset.h | 4 +++ python-package/lightgbm/basic.py | 28 ++++++++++----- python-package/lightgbm/compat.py | 2 ++ src/io/dataset.cpp | 2 ++ src/io/metadata.cpp | 28 ++++++++++----- tests/python_package_test/test_arrow.py | 45 ++++++++++++++++++++++++- 7 files changed, 95 insertions(+), 19 deletions(-) diff --git a/include/LightGBM/c_api.h b/include/LightGBM/c_api.h index eafe6fab7825..ada2e4109638 100644 --- a/include/LightGBM/c_api.h +++ b/include/LightGBM/c_api.h @@ -559,9 +559,10 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetSetField(DatasetHandle handle, * \brief Set vector to a content in info. * \note * - \a group converts input datatype into ``int32``; - * - \a label and \a weight convert input datatype into ``float32``. + * - \a label and \a weight convert input datatype into ``float32``; + * - \a init_score converts input datatype into ``float64``. * \param handle Handle of dataset - * \param field_name Field name, can be \a label, \a weight, \a group + * \param field_name Field name, can be \a label, \a weight, \a init_score, \a group * \param n_chunks The number of Arrow arrays passed to this function * \param chunks Pointer to the list of Arrow arrays * \param schema Pointer to the schema of all Arrow arrays diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index bf8264276a5f..220a1f9f009c 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -125,6 +125,7 @@ class Metadata { * \param init_score Initial scores, this class will manage memory for init_score. */ void SetInitScore(const double* init_score, data_size_t len); + void SetInitScore(const ArrowChunkedArray& array); /*! @@ -347,6 +348,9 @@ class Metadata { void SetWeightsFromIterator(It first, It last); /*! \brief Insert initial scores at the given index */ void InsertInitScores(const double* init_scores, data_size_t start_index, data_size_t len, data_size_t source_size); + /*! \brief Set init scores from pointers to the first element and the end of an iterator. */ + template + void SetInitScoresFromIterator(It first, It last); /*! \brief Insert queries at the given index */ void InsertQueries(const data_size_t* queries, data_size_t start_index, data_size_t len); /*! \brief Set queries from pointers to the first element and the end of an iterator. */ diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index b55546941f77..31ae5182ee9e 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -19,8 +19,8 @@ import scipy.sparse from .compat import (PANDAS_INSTALLED, PYARROW_INSTALLED, arrow_cffi, arrow_is_floating, arrow_is_integer, concat, - dt_DataTable, pa_Array, pa_ChunkedArray, pa_compute, pa_Table, pd_CategoricalDtype, pd_DataFrame, - pd_Series) + dt_DataTable, pa_Array, pa_chunked_array, pa_ChunkedArray, pa_compute, pa_Table, + pd_CategoricalDtype, pd_DataFrame, pd_Series) from .libpath import find_lib_path if TYPE_CHECKING: @@ -84,6 +84,9 @@ np.ndarray, pd_Series, pd_DataFrame, + pa_Table, + pa_Array, + pa_ChunkedArray, ] _LGBM_TrainDataType = Union[ str, @@ -1660,7 +1663,7 @@ def __init__( sum(group) = n_samples. 
For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups, where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc. - init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), or None, optional (default=None) + init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), pyarrow Array, pyarrow ChunkedArray, pyarrow Table (for multi-class task) or None, optional (default=None) Init score for Dataset. feature_name : list of str, or 'auto', optional (default="auto") Feature names. @@ -2440,7 +2443,7 @@ def create_valid( sum(group) = n_samples. For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups, where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc. - init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), or None, optional (default=None) + init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), pyarrow Array, pyarrow ChunkedArray, pyarrow Table (for multi-class task) or None, optional (default=None) Init score for Dataset. params : dict or None, optional (default=None) Other parameters for validation Dataset. @@ -2547,7 +2550,7 @@ def _reverse_update_params(self) -> "Dataset": def set_field( self, field_name: str, - data: Optional[Union[List[List[float]], List[List[int]], List[float], List[int], np.ndarray, pd_Series, pd_DataFrame, pa_Array, pa_ChunkedArray]] + data: Optional[Union[List[List[float]], List[List[int]], List[float], List[int], np.ndarray, pd_Series, pd_DataFrame, pa_Table, pa_Array, pa_ChunkedArray]] ) -> "Dataset": """Set property into the Dataset. @@ -2576,7 +2579,16 @@ def set_field( return self # If the data is a arrow data, we can just pass it to C - if _is_pyarrow_array(data): + if _is_pyarrow_array(data) or _is_pyarrow_table(data): + # If a table is being passed, we concatenate the columns. This is only valid for + # 'init_score'. + if _is_pyarrow_table(data): + if field_name != "init_score": + raise ValueError(f"pyarrow tables are not supported for field '{field_name}'") + data = pa_chunked_array([ + chunk for array in data.columns for chunk in array.chunks # type: ignore + ]) + c_array = _export_arrow_to_c(data) _safe_call(_LIB.LGBM_DatasetSetFieldFromArrow( self._handle, @@ -2869,7 +2881,7 @@ def set_init_score( Parameters ---------- - init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), or None + init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), pyarrow Array, pyarrow ChunkedArray, pyarrow Table (for multi-class task) or None Init score for Booster. Returns @@ -4443,7 +4455,7 @@ def refit( .. 
versionadded:: 4.0.0 - init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), or None, optional (default=None) + init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), pyarrow Array, pyarrow ChunkedArray, pyarrow Table (for multi-class task) or None, optional (default=None) Init score for ``data``. .. versionadded:: 4.0.0 diff --git a/python-package/lightgbm/compat.py b/python-package/lightgbm/compat.py index dc48dbf792cf..bd1b29a1e802 100644 --- a/python-package/lightgbm/compat.py +++ b/python-package/lightgbm/compat.py @@ -201,6 +201,7 @@ def __init__(self, *args, **kwargs): from pyarrow import Array as pa_Array from pyarrow import ChunkedArray as pa_ChunkedArray from pyarrow import Table as pa_Table + from pyarrow import chunked_array as pa_chunked_array from pyarrow.cffi import ffi as arrow_cffi from pyarrow.types import is_floating as arrow_is_floating from pyarrow.types import is_integer as arrow_is_integer @@ -243,6 +244,7 @@ class pa_compute: # type: ignore all = None equal = None + pa_chunked_array = None arrow_is_integer = None arrow_is_floating = None diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 78dd5e4319a5..058d7bd328ad 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -904,6 +904,8 @@ bool Dataset::SetFieldFromArrow(const char* field_name, const ArrowChunkedArray metadata_.SetLabel(ca); } else if (name == std::string("weight") || name == std::string("weights")) { metadata_.SetWeights(ca); + } else if (name == std::string("init_score")) { + metadata_.SetInitScore(ca); } else if (name == std::string("query") || name == std::string("group")) { metadata_.SetQuery(ca); } else { diff --git a/src/io/metadata.cpp b/src/io/metadata.cpp index d94b0ed3f2f7..55440649f55e 100644 --- a/src/io/metadata.cpp +++ b/src/io/metadata.cpp @@ -355,32 +355,44 @@ void Metadata::CheckOrPartition(data_size_t num_all_data, const std::vector +void Metadata::SetInitScoresFromIterator(It first, It last) { std::lock_guard lock(mutex_); - // save to nullptr - if (init_score == nullptr || len == 0) { + // Clear init scores on empty input + if (last - first == 0) { init_score_.clear(); num_init_score_ = 0; return; } - if ((len % num_data_) != 0) { + if (((last - first) % num_data_) != 0) { Log::Fatal("Initial score size doesn't match data size"); } - if (init_score_.empty()) { init_score_.resize(len); } - num_init_score_ = len; + if (init_score_.empty()) { + init_score_.resize(last - first); + } + num_init_score_ = last - first; #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (num_init_score_ >= 1024) for (int64_t i = 0; i < num_init_score_; ++i) { - init_score_[i] = Common::AvoidInf(init_score[i]); + init_score_[i] = Common::AvoidInf(first[i]); } init_score_load_from_file_ = false; + #ifdef USE_CUDA if (cuda_metadata_ != nullptr) { - cuda_metadata_->SetInitScore(init_score_.data(), len); + cuda_metadata_->SetInitScore(init_score_.data(), init_score_.size()); } #endif // USE_CUDA } +void Metadata::SetInitScore(const double* init_score, data_size_t len) { + SetInitScoresFromIterator(init_score, init_score + len); +} + +void Metadata::SetInitScore(const ArrowChunkedArray& array) { + SetInitScoresFromIterator(array.begin(), array.end()); +} + void Metadata::InsertInitScores(const double* init_scores, data_size_t start_index, data_size_t len, data_size_t source_size) { if (num_init_score_ <= 0) { Log::Fatal("Inserting 
initial score data into dataset with no initial scores"); diff --git a/tests/python_package_test/test_arrow.py b/tests/python_package_test/test_arrow.py index 38b053e94fd5..fd20df25dd87 100644 --- a/tests/python_package_test/test_arrow.py +++ b/tests/python_package_test/test_arrow.py @@ -178,7 +178,7 @@ def test_dataset_construct_weights_none(): ["array_type", "weight_data"], [(pa.array, [3, 0.7, 1.5, 0.5, 0.1]), (pa.chunked_array, [[3], [0.7, 1.5, 0.5, 0.1]])], ) -@pytest.mark.parametrize("arrow_type", [pa.float32(), pa.float64()]) +@pytest.mark.parametrize("arrow_type", _FLOAT_TYPES) def test_dataset_construct_weights(array_type, weight_data, arrow_type): data = generate_dummy_arrow_table() weights = array_type(weight_data, type=arrow_type) @@ -210,3 +210,46 @@ def test_dataset_construct_groups(array_type, group_data, arrow_type): expected = np.array([0, 2, 5], dtype=np.int32) np_assert_array_equal(expected, dataset.get_field("group"), strict=True) + + +# ----------------------------------------- INIT SCORES ----------------------------------------- # + + +@pytest.mark.parametrize( + ["array_type", "init_score_data"], + [ + (pa.array, [0, 1, 2, 3, 3]), + (pa.chunked_array, [[0, 1, 2], [3, 3]]), + (pa.chunked_array, [[], [0, 1, 2], [3, 3]]), + (pa.chunked_array, [[0, 1], [], [], [2], [3, 3], []]), + ], +) +@pytest.mark.parametrize("arrow_type", _INTEGER_TYPES + _FLOAT_TYPES) +def test_dataset_construct_init_scores_array( + array_type: Any, init_score_data: Any, arrow_type: Any +): + data = generate_dummy_arrow_table() + init_scores = array_type(init_score_data, type=arrow_type) + dataset = lgb.Dataset(data, init_score=init_scores, params=dummy_dataset_params()) + dataset.construct() + + expected = np.array([0, 1, 2, 3, 3], dtype=np.float64) + np_assert_array_equal(expected, dataset.get_init_score(), strict=True) + + +def test_dataset_construct_init_scores_table(): + data = generate_dummy_arrow_table() + init_scores = pa.Table.from_arrays( + [ + generate_random_arrow_array(5, seed=1), + generate_random_arrow_array(5, seed=2), + generate_random_arrow_array(5, seed=3), + ], + names=["a", "b", "c"], + ) + dataset = lgb.Dataset(data, init_score=init_scores, params=dummy_dataset_params()) + dataset.construct() + + actual = dataset.get_init_score() + expected = init_scores.to_pandas().to_numpy().astype(np.float64) + np_assert_array_equal(expected, actual, strict=True) From d84582b746500237c52701975e006ba8a813d229 Mon Sep 17 00:00:00 2001 From: Oliver Borchert Date: Wed, 6 Dec 2023 16:18:28 +0000 Subject: [PATCH 2/8] Fix null handling for Arrow data (#6227) --- include/LightGBM/arrow.tpp | 2 +- tests/cpp_tests/test_arrow.cpp | 6 ++++-- tests/python_package_test/test_arrow.py | 11 +++++++++++ 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/include/LightGBM/arrow.tpp b/include/LightGBM/arrow.tpp index 67b481c9497e..8d1ce4f4c0c1 100644 --- a/include/LightGBM/arrow.tpp +++ b/include/LightGBM/arrow.tpp @@ -144,7 +144,7 @@ struct ArrayIndexAccessor { // - The structure of validity bitmasks is taken from here: // https://arrow.apache.org/docs/format/Columnar.html#validity-bitmaps // - If the bitmask is NULL, all indices are valid - if (validity == nullptr || !(validity[buffer_idx / 8] & (1 << (buffer_idx % 8)))) { + if (validity == nullptr || (validity[buffer_idx / 8] & (1 << (buffer_idx % 8)))) { // In case the index is valid, we take it from the data buffer auto data = static_cast(array->buffers[1]); return static_cast(data[buffer_idx]); diff --git a/tests/cpp_tests/test_arrow.cpp 
b/tests/cpp_tests/test_arrow.cpp index 7e3c57c401f4..e975b6ba374b 100644 --- a/tests/cpp_tests/test_arrow.cpp +++ b/tests/cpp_tests/test_arrow.cpp @@ -41,10 +41,12 @@ class ArrowChunkedArrayTest : public testing::Test { // 1) Create validity bitmap char* validity = nullptr; if (!null_indices.empty()) { - validity = static_cast(calloc(values.size() + sizeof(char) - 1, sizeof(char))); + auto num_bytes = (values.size() + 7) / 8; + validity = static_cast(calloc(num_bytes, sizeof(char))); + memset(validity, 0xff, num_bytes * sizeof(char)); for (size_t i = 0; i < values.size(); ++i) { if (std::find(null_indices.begin(), null_indices.end(), i) != null_indices.end()) { - validity[i / 8] |= (1 << (i % 8)); + validity[i / 8] &= ~(1 << (i % 8)); } } } diff --git a/tests/python_package_test/test_arrow.py b/tests/python_package_test/test_arrow.py index fd20df25dd87..5e09465e34b3 100644 --- a/tests/python_package_test/test_arrow.py +++ b/tests/python_package_test/test_arrow.py @@ -46,6 +46,16 @@ def generate_simple_arrow_table() -> pa.Table: return pa.Table.from_arrays(columns, names=[f"col_{i}" for i in range(len(columns))]) +def generate_nullable_arrow_table() -> pa.Table: + columns = [ + pa.chunked_array([[1, None, 3, 4, 5]], type=pa.float32()), + pa.chunked_array([[None, 2, 3, 4, 5]], type=pa.float32()), + pa.chunked_array([[1, 2, 3, 4, None]], type=pa.float32()), + pa.chunked_array([[None, None, None, None, None]], type=pa.float32()), + ] + return pa.Table.from_arrays(columns, names=[f"col_{i}" for i in range(len(columns))]) + + def generate_dummy_arrow_table() -> pa.Table: col1 = pa.chunked_array([[1, 2, 3], [4, 5]], type=pa.uint8()) col2 = pa.chunked_array([[0.5, 0.6], [0.1, 0.8, 1.5]], type=pa.float32()) @@ -95,6 +105,7 @@ def dummy_dataset_params() -> Dict[str, Any]: [ # Use lambda functions here to minimize memory consumption (lambda: generate_simple_arrow_table(), dummy_dataset_params()), (lambda: generate_dummy_arrow_table(), dummy_dataset_params()), + (lambda: generate_nullable_arrow_table(), dummy_dataset_params()), (lambda: generate_random_arrow_table(3, 1000, 42), {}), (lambda: generate_random_arrow_table(100, 10000, 43), {}), ], From 4aba4fc1326210a1501f144bd54d77a64d127362 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Wed, 6 Dec 2023 12:56:27 -0600 Subject: [PATCH 3/8] [R-package] change CRAN maintainer (#6224) --- R-package/DESCRIPTION | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION index 1193c0d463b9..62b479530b4a 100755 --- a/R-package/DESCRIPTION +++ b/R-package/DESCRIPTION @@ -4,10 +4,10 @@ Title: Light Gradient Boosting Machine Version: ~~VERSION~~ Date: ~~DATE~~ Authors@R: c( - person("Yu", "Shi", email = "yushi2@microsoft.com", role = c("aut", "cre")), + person("Yu", "Shi", email = "yushi2@microsoft.com", role = c("aut")), person("Guolin", "Ke", email = "guolin.ke@outlook.com", role = c("aut")), person("Damien", "Soukhavong", email = "damien.soukhavong@skema.edu", role = c("aut")), - person("James", "Lamb", email="jaylamb20@gmail.com", role = c("aut")), + person("James", "Lamb", email="jaylamb20@gmail.com", role = c("aut", "cre")), person("Qi", "Meng", role = c("aut")), person("Thomas", "Finley", role = c("aut")), person("Taifeng", "Wang", role = c("aut")), From e797985227a012a837c20eddc457de6b7fc7aeaa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Morales?= Date: Thu, 7 Dec 2023 08:54:18 -0600 Subject: [PATCH 4/8] [python-package] take shallow copy of dataframe in predict (fixes #6195) (#6218) --- 
python-package/lightgbm/basic.py | 5 ++++- tests/python_package_test/test_basic.py | 19 ++++++++++++++++--- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index 31ae5182ee9e..c4022e7fdd9a 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -789,6 +789,10 @@ def _data_from_pandas( if len(data.shape) != 2 or data.shape[0] < 1: raise ValueError('Input data must be 2 dimensional and non empty.') + # take shallow copy in case we modify categorical columns + # whole column modifications don't change the original df + data = data.copy(deep=False) + # determine feature names if feature_name == 'auto': feature_name = [str(col) for col in data.columns] @@ -805,7 +809,6 @@ def _data_from_pandas( if list(data[col].cat.categories) != list(category): data[col] = data[col].cat.set_categories(category) if len(cat_cols): # cat_cols is list - data = data.copy(deep=False) # not alter origin DataFrame data[cat_cols] = data[cat_cols].apply(lambda x: x.cat.codes).replace({-1: np.nan}) if categorical_feature == 'auto': # use cat cols from DataFrame categorical_feature = cat_cols_not_ordered diff --git a/tests/python_package_test/test_basic.py b/tests/python_package_test/test_basic.py index 2f6b07e7a77f..b8ef43e41397 100644 --- a/tests/python_package_test/test_basic.py +++ b/tests/python_package_test/test_basic.py @@ -822,21 +822,34 @@ def test_no_copy_when_single_float_dtype_dataframe(dtype, feature_name): @pytest.mark.parametrize('feature_name', [['x1'], [42], 'auto']) -def test_categorical_code_conversion_doesnt_modify_original_data(feature_name): +@pytest.mark.parametrize('categories', ['seen', 'unseen']) +def test_categorical_code_conversion_doesnt_modify_original_data(feature_name, categories): pd = pytest.importorskip('pandas') X = np.random.choice(['a', 'b'], 100).reshape(-1, 1) column_name = 'a' if feature_name == 'auto' else feature_name[0] df = pd.DataFrame(X.copy(), columns=[column_name], dtype='category') + if categories == 'seen': + pandas_categorical = [['a', 'b']] + else: + pandas_categorical = [['a']] data = lgb.basic._data_from_pandas( data=df, feature_name=feature_name, categorical_feature="auto", - pandas_categorical=None + pandas_categorical=pandas_categorical, )[0] # check that the original data wasn't modified np.testing.assert_equal(df[column_name], X[:, 0]) # check that the built data has the codes - np.testing.assert_equal(df[column_name].cat.codes, data[:, 0]) + if categories == 'seen': + # if all categories were seen during training we just take the codes + codes = df[column_name].cat.codes + else: + # if we only saw 'a' during training we just replace its code + # and leave the rest as nan + a_code = df[column_name].cat.categories.get_loc('a') + codes = np.where(df[column_name] == 'a', a_code, np.nan) + np.testing.assert_equal(codes, data[:, 0]) @pytest.mark.parametrize('min_data_in_bin', [2, 10]) From 1548b42bac5d5b7c295ba4d3132e8bda47e34fd1 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Thu, 7 Dec 2023 17:03:16 -0600 Subject: [PATCH 5/8] [R-package] [c++] add tighter multithreading control, avoid global OpenMP side effects (fixes #4705, fixes #5102) (#6226) --- .ci/lint-cpp.sh | 3 +- CMakeLists.txt | 1 + R-package/NAMESPACE | 2 + R-package/R/lgb.Booster.R | 12 +++++ R-package/R/lgb.Dataset.R | 22 ++++++++ R-package/R/lgb.cv.R | 2 + R-package/R/lgb.importance.R | 2 + R-package/R/lgb.interprete.R | 2 + R-package/R/lgb.model.dt.tree.R | 2 + 
R-package/R/lgb.plot.importance.R | 2 + R-package/R/lgb.plot.interpretation.R | 2 + R-package/R/lgb.restore_handle.R | 4 ++ R-package/R/lgb.train.R | 2 + R-package/R/multithreading.R | 51 +++++++++++++++++++ R-package/R/readRDS.lgb.Booster.R | 2 + R-package/R/saveRDS.lgb.Booster.R | 2 + R-package/man/dim.Rd | 2 + R-package/man/dimnames.lgb.Dataset.Rd | 2 + R-package/man/getLGBMThreads.Rd | 26 ++++++++++ R-package/man/get_field.Rd | 2 + R-package/man/lgb.Dataset.Rd | 2 + R-package/man/lgb.Dataset.construct.Rd | 2 + R-package/man/lgb.Dataset.create.valid.Rd | 2 + R-package/man/lgb.Dataset.save.Rd | 2 + R-package/man/lgb.Dataset.set.categorical.Rd | 2 + R-package/man/lgb.Dataset.set.reference.Rd | 2 + R-package/man/lgb.configure_fast_predict.Rd | 2 + R-package/man/lgb.cv.Rd | 2 + R-package/man/lgb.dump.Rd | 2 + R-package/man/lgb.get.eval.result.Rd | 2 + R-package/man/lgb.importance.Rd | 2 + R-package/man/lgb.interprete.Rd | 2 + R-package/man/lgb.load.Rd | 2 + R-package/man/lgb.model.dt.tree.Rd | 2 + R-package/man/lgb.plot.importance.Rd | 2 + R-package/man/lgb.plot.interpretation.Rd | 2 + R-package/man/lgb.restore_handle.Rd | 4 ++ R-package/man/lgb.save.Rd | 2 + R-package/man/lgb.train.Rd | 2 + R-package/man/predict.lgb.Booster.Rd | 2 + R-package/man/readRDS.lgb.Booster.Rd | 2 + R-package/man/saveRDS.lgb.Booster.Rd | 2 + R-package/man/setLGBMThreads.Rd | 32 ++++++++++++ R-package/man/set_field.Rd | 2 + R-package/man/slice.Rd | 2 + R-package/src/Makevars.in | 1 + R-package/src/Makevars.win.in | 1 + R-package/src/lightgbm_R.cpp | 19 +++++++ R-package/src/lightgbm_R.h | 19 +++++++ R-package/tests/testthat/helper.R | 5 ++ .../tests/testthat/test_multithreading.R | 16 ++++++ R-package/vignettes/basic_walkthrough.Rmd | 6 +++ build-cran-package.sh | 1 + include/LightGBM/c_api.h | 14 +++++ include/LightGBM/utils/openmp_wrapper.h | 47 ++++++++++------- src/c_api.cpp | 17 +++++++ src/utils/openmp_wrapper.cpp | 44 ++++++++++++++++ tests/c_api_test/test_.py | 33 ++++++++++++ 58 files changed, 429 insertions(+), 21 deletions(-) create mode 100644 R-package/R/multithreading.R create mode 100644 R-package/man/getLGBMThreads.Rd create mode 100644 R-package/man/setLGBMThreads.Rd create mode 100644 R-package/tests/testthat/test_multithreading.R create mode 100644 src/utils/openmp_wrapper.cpp diff --git a/.ci/lint-cpp.sh b/.ci/lint-cpp.sh index 56489ecf3325..2d91f8e85f00 100755 --- a/.ci/lint-cpp.sh +++ b/.ci/lint-cpp.sh @@ -30,8 +30,7 @@ get_omp_pragmas_without_num_threads() { --include='*.h' \ --include='*.hpp' \ 'pragma omp parallel' \ - | grep -v ' num_threads' \ - | grep -v 'openmp_wrapper.h' + | grep -v ' num_threads' } PROBLEMATIC_LINES=$( get_omp_pragmas_without_num_threads diff --git a/CMakeLists.txt b/CMakeLists.txt index 50b3cbaaf189..aef95871e4cc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -432,6 +432,7 @@ file( src/objective/*.cpp src/network/*.cpp src/treelearner/*.cpp + src/utils/*.cpp if(USE_CUDA) src/treelearner/*.cu src/boosting/cuda/*.cpp diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE index e07af84d8824..ab987d0593eb 100644 --- a/R-package/NAMESPACE +++ b/R-package/NAMESPACE @@ -9,6 +9,7 @@ S3method(print,lgb.Booster) S3method(set_field,lgb.Dataset) S3method(slice,lgb.Dataset) S3method(summary,lgb.Booster) +export(getLGBMthreads) export(get_field) export(lgb.Dataset) export(lgb.Dataset.construct) @@ -35,6 +36,7 @@ export(lgb.train) export(lightgbm) export(readRDS.lgb.Booster) export(saveRDS.lgb.Booster) +export(setLGBMthreads) export(set_field) export(slice) import(methods) 
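A note on the thread-control machinery this patch wires up: the exported R functions setLGBMthreads()/getLGBMthreads() are thin wrappers over the new C-API entry points LGBM_SetMaxThreads/LGBM_GetMaxThreads added further down. Here is a minimal sketch of driving those entry points directly via ctypes, mirroring the tests/c_api_test/test_.py additions in this patch; the shared-library filename is an assumption and varies by platform (lib_lightgbm.dll on Windows, lib_lightgbm.dylib on macOS):

    import ctypes

    # load the LightGBM shared library (path/filename assumed here)
    LIB = ctypes.cdll.LoadLibrary("lib_lightgbm.so")

    # cap LightGBM at 4 threads for code paths that don't accept num_threads
    assert LIB.LGBM_SetMaxThreads(ctypes.c_int(4)) == 0

    num_threads = ctypes.c_int(0)
    assert LIB.LGBM_GetMaxThreads(ctypes.byref(num_threads)) == 0
    assert num_threads.value == 4

    # any non-positive value removes the cap (stored internally as -1)
    assert LIB.LGBM_SetMaxThreads(ctypes.c_int(-1)) == 0

The cap is process-wide: OMP_NUM_THREADS() in src/utils/openmp_wrapper.cpp (added later in this patch) clamps to it, so no LightGBM operation in the process exceeds the limit.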
diff --git a/R-package/R/lgb.Booster.R b/R-package/R/lgb.Booster.R index 17da9545ae19..4437c6fa552e 100644 --- a/R-package/R/lgb.Booster.R +++ b/R-package/R/lgb.Booster.R @@ -917,6 +917,8 @@ NULL #' the factor levels not being present in the output. #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) @@ -1082,6 +1084,8 @@ predict.lgb.Booster <- function(object, #' \link{predict.lgb.Booster}. #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' library(lightgbm) #' data(mtcars) #' X <- as.matrix(mtcars[, -1L]) @@ -1224,6 +1228,8 @@ summary.lgb.Booster <- function(object, ...) { #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) @@ -1289,6 +1295,8 @@ lgb.load <- function(filename = NULL, model_str = NULL) { #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' library(lightgbm) #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train @@ -1346,6 +1354,8 @@ lgb.save <- function(booster, filename, num_iteration = NULL) { #' @examples #' \donttest{ #' library(lightgbm) +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) @@ -1396,6 +1406,8 @@ lgb.dump <- function(booster, num_iteration = NULL) { #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' # train a regression model #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train diff --git a/R-package/R/lgb.Dataset.R b/R-package/R/lgb.Dataset.R index ddc338d2cae3..ff9b0b4fa38a 100644 --- a/R-package/R/lgb.Dataset.R +++ b/R-package/R/lgb.Dataset.R @@ -780,6 +780,8 @@ Dataset <- R6::R6Class( #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) @@ -837,6 +839,8 @@ lgb.Dataset <- function(data, #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) @@ -913,6 +917,8 @@ lgb.Dataset.create.valid <- function(dataset, #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) @@ -942,6 +948,8 @@ lgb.Dataset.construct <- function(dataset) { #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) @@ -975,6 +983,8 @@ dim.lgb.Dataset <- function(x) { #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' data(agaricus.train, package = "lightgbm") #' train <- 
agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) @@ -1045,6 +1055,8 @@ dimnames.lgb.Dataset <- function(x) { #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) @@ -1089,6 +1101,8 @@ slice.lgb.Dataset <- function(dataset, idxset) { #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) @@ -1138,6 +1152,8 @@ get_field.lgb.Dataset <- function(dataset, field_name) { #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) @@ -1177,6 +1193,8 @@ set_field.lgb.Dataset <- function(dataset, field_name, data) { #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) @@ -1207,6 +1225,8 @@ lgb.Dataset.set.categorical <- function(dataset, categorical_feature) { #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' # create training Dataset #' data(agaricus.train, package ="lightgbm") #' train <- agaricus.train @@ -1240,6 +1260,8 @@ lgb.Dataset.set.reference <- function(dataset, reference) { #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/R/lgb.cv.R b/R-package/R/lgb.cv.R index 11768c5bfa0b..0545fbf71899 100644 --- a/R-package/R/lgb.cv.R +++ b/R-package/R/lgb.cv.R @@ -51,6 +51,8 @@ CVBooster <- R6::R6Class( #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/R/lgb.importance.R b/R-package/R/lgb.importance.R index 27efb17392df..7c76131f4f53 100644 --- a/R-package/R/lgb.importance.R +++ b/R-package/R/lgb.importance.R @@ -14,6 +14,8 @@ #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/R/lgb.interprete.R b/R-package/R/lgb.interprete.R index 976315262792..8f93d45429f1 100644 --- a/R-package/R/lgb.interprete.R +++ b/R-package/R/lgb.interprete.R @@ -17,6 +17,8 @@ #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' Logit <- function(x) log(x / (1.0 - x)) #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train diff --git a/R-package/R/lgb.model.dt.tree.R b/R-package/R/lgb.model.dt.tree.R index 5d994accfa7f..bf4562e41018 100644 --- a/R-package/R/lgb.model.dt.tree.R +++ b/R-package/R/lgb.model.dt.tree.R @@ -29,6 +29,8 @@ #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' 
\dontshow{data.table::setDTthreads(1L)} #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/R/lgb.plot.importance.R b/R-package/R/lgb.plot.importance.R index fc59ebd0efec..b8a90ca158ae 100644 --- a/R-package/R/lgb.plot.importance.R +++ b/R-package/R/lgb.plot.importance.R @@ -19,6 +19,8 @@ #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/R/lgb.plot.interpretation.R b/R-package/R/lgb.plot.interpretation.R index 8b95371eb3c2..97650f30a7d3 100644 --- a/R-package/R/lgb.plot.interpretation.R +++ b/R-package/R/lgb.plot.interpretation.R @@ -16,6 +16,8 @@ #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' Logit <- function(x) { #' log(x / (1.0 - x)) #' } diff --git a/R-package/R/lgb.restore_handle.R b/R-package/R/lgb.restore_handle.R index 0ed25ef26f3d..8a24cc628ca9 100644 --- a/R-package/R/lgb.restore_handle.R +++ b/R-package/R/lgb.restore_handle.R @@ -16,7 +16,10 @@ #' @return \code{lgb.Booster} (the same `model` object that was passed as input, invisibly). #' @seealso \link{lgb.make_serializable}, \link{lgb.drop_serialized}. #' @examples +#' \donttest{ #' library(lightgbm) +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' data("agaricus.train") #' model <- lightgbm( #' agaricus.train$data @@ -33,6 +36,7 @@ #' model_new$check_null_handle() #' lgb.restore_handle(model_new) #' model_new$check_null_handle() +#' } #' @export lgb.restore_handle <- function(model) { if (!.is_Booster(x = model)) { diff --git a/R-package/R/lgb.train.R b/R-package/R/lgb.train.R index 6979558d22cd..8a299fb6b8ac 100644 --- a/R-package/R/lgb.train.R +++ b/R-package/R/lgb.train.R @@ -19,6 +19,8 @@ #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/R/multithreading.R b/R-package/R/multithreading.R new file mode 100644 index 000000000000..a8d6b51a8968 --- /dev/null +++ b/R-package/R/multithreading.R @@ -0,0 +1,51 @@ +#' @name setLGBMThreads +#' @title Set maximum number of threads used by LightGBM +#' @description LightGBM attempts to speed up many operations by using multi-threading. +#' The number of threads used in those operations can be controlled via the +#' \code{num_threads} parameter passed through \code{params} to functions like +#' \link{lgb.train} and \link{lgb.Dataset}. However, some operations (like materializing +#' a model from a text file) are done via code paths that don't explicitly accept thread-control +#' configuration. +#' +#' Use this function to set the maximum number of threads LightGBM will use for such operations. +#' +#' This function affects all LightGBM operations in the same process. +#' +#' So, for example, if you call \code{setLGBMthreads(4)}, no other multi-threaded LightGBM +#' operation in the same process will use more than 4 threads. +#' +#' Call \code{setLGBMthreads(-1)} to remove this limitation. 
+#' @param num_threads maximum number of threads to be used by LightGBM in multi-threaded operations +#' @return NULL +#' @seealso \link{getLGBMthreads} +#' @export +setLGBMthreads <- function(num_threads) { + .Call( + LGBM_SetMaxThreads_R, + num_threads + ) + return(invisible(NULL)) +} + +#' @name getLGBMThreads +#' @title Get default number of threads used by LightGBM +#' @description LightGBM attempts to speed up many operations by using multi-threading. +#' The number of threads used in those operations can be controlled via the +#' \code{num_threads} parameter passed through \code{params} to functions like +#' \link{lgb.train} and \link{lgb.Dataset}. However, some operations (like materializing +#' a model from a text file) are done via code paths that don't explicitly accept thread-control +#' configuration. +#' +#' Use this function to see the default number of threads LightGBM will use for such operations. +#' @return number of threads as an integer. \code{-1} means that in situations where parameter \code{num_threads} is +#' not explicitly supplied, LightGBM will choose a number of threads to use automatically. +#' @seealso \link{setLGBMthreads} +#' @export +getLGBMthreads <- function() { + out <- 0L + .Call( + LGBM_GetMaxThreads_R, + out + ) + return(out) +} diff --git a/R-package/R/readRDS.lgb.Booster.R b/R-package/R/readRDS.lgb.Booster.R index a8abac642c24..69e954fc75f1 100644 --- a/R-package/R/readRDS.lgb.Booster.R +++ b/R-package/R/readRDS.lgb.Booster.R @@ -12,6 +12,8 @@ #' @examples #' \donttest{ #' library(lightgbm) +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/R/saveRDS.lgb.Booster.R b/R-package/R/saveRDS.lgb.Booster.R index d75056e69734..d227d75eb90d 100644 --- a/R-package/R/saveRDS.lgb.Booster.R +++ b/R-package/R/saveRDS.lgb.Booster.R @@ -22,6 +22,8 @@ #' @examples #' \donttest{ #' library(lightgbm) +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/dim.Rd b/R-package/man/dim.Rd index 94ca192d8291..69332d0ec397 100644 --- a/R-package/man/dim.Rd +++ b/R-package/man/dim.Rd @@ -21,6 +21,8 @@ be directly used with an \code{lgb.Dataset} object. 
} \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/dimnames.lgb.Dataset.Rd b/R-package/man/dimnames.lgb.Dataset.Rd index ec01a04f607b..85f2085f1d77 100644 --- a/R-package/man/dimnames.lgb.Dataset.Rd +++ b/R-package/man/dimnames.lgb.Dataset.Rd @@ -28,6 +28,8 @@ Since row names are irrelevant, it is recommended to use \code{colnames} directl } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/getLGBMThreads.Rd b/R-package/man/getLGBMThreads.Rd new file mode 100644 index 000000000000..21af4f4849d4 --- /dev/null +++ b/R-package/man/getLGBMThreads.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/multithreading.R +\name{getLGBMThreads} +\alias{getLGBMThreads} +\alias{getLGBMthreads} +\title{Get default number of threads used by LightGBM} +\usage{ +getLGBMthreads() +} +\value{ +number of threads as an integer. \code{-1} means that in situations where parameter \code{num_threads} is + not explicitly supplied, LightGBM will choose a number of threads to use automatically. +} +\description{ +LightGBM attempts to speed up many operations by using multi-threading. + The number of threads used in those operations can be controlled via the + \code{num_threads} parameter passed through \code{params} to functions like + \link{lgb.train} and \link{lgb.Dataset}. However, some operations (like materializing + a model from a text file) are done via code paths that don't explicitly accept thread-control + configuration. + + Use this function to see the default number of threads LightGBM will use for such operations. 
+} +\seealso{ +\link{setLGBMthreads} +} diff --git a/R-package/man/get_field.Rd b/R-package/man/get_field.Rd index 1b6692fcf807..e2562cc21364 100644 --- a/R-package/man/get_field.Rd +++ b/R-package/man/get_field.Rd @@ -32,6 +32,8 @@ Get one attribute of a \code{lgb.Dataset} } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/lgb.Dataset.Rd b/R-package/man/lgb.Dataset.Rd index 4895600ff922..2605657b060a 100644 --- a/R-package/man/lgb.Dataset.Rd +++ b/R-package/man/lgb.Dataset.Rd @@ -65,6 +65,8 @@ Construct \code{lgb.Dataset} object from dense matrix, sparse matrix } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/lgb.Dataset.construct.Rd b/R-package/man/lgb.Dataset.construct.Rd index 97c9e7887602..e400e0a5f8d5 100644 --- a/R-package/man/lgb.Dataset.construct.Rd +++ b/R-package/man/lgb.Dataset.construct.Rd @@ -17,6 +17,8 @@ Construct Dataset explicitly } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/lgb.Dataset.create.valid.Rd b/R-package/man/lgb.Dataset.create.valid.Rd index ab8ca753c2b9..fc50dff19986 100644 --- a/R-package/man/lgb.Dataset.create.valid.Rd +++ b/R-package/man/lgb.Dataset.create.valid.Rd @@ -48,6 +48,8 @@ Construct validation data according to training data } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/lgb.Dataset.save.Rd b/R-package/man/lgb.Dataset.save.Rd index 5ea38227ba66..b03c2c5e0ac5 100644 --- a/R-package/man/lgb.Dataset.save.Rd +++ b/R-package/man/lgb.Dataset.save.Rd @@ -20,6 +20,8 @@ Please note that \code{init_score} is not saved in binary file. } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/lgb.Dataset.set.categorical.Rd b/R-package/man/lgb.Dataset.set.categorical.Rd index 26eb10770e47..5dfcc9a771e8 100644 --- a/R-package/man/lgb.Dataset.set.categorical.Rd +++ b/R-package/man/lgb.Dataset.set.categorical.Rd @@ -22,6 +22,8 @@ Set the categorical features of an \code{lgb.Dataset} object. 
Use this function } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/lgb.Dataset.set.reference.Rd b/R-package/man/lgb.Dataset.set.reference.Rd index 349b0b22913e..a4efbfac5962 100644 --- a/R-package/man/lgb.Dataset.set.reference.Rd +++ b/R-package/man/lgb.Dataset.set.reference.Rd @@ -19,6 +19,8 @@ If you want to use validation data, you should set reference to training data } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} # create training Dataset data(agaricus.train, package ="lightgbm") train <- agaricus.train diff --git a/R-package/man/lgb.configure_fast_predict.Rd b/R-package/man/lgb.configure_fast_predict.Rd index 39fe6afa6b18..e02600451df5 100644 --- a/R-package/man/lgb.configure_fast_predict.Rd +++ b/R-package/man/lgb.configure_fast_predict.Rd @@ -114,6 +114,8 @@ Calling this function multiple times with different parameters might not overrid } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} library(lightgbm) data(mtcars) X <- as.matrix(mtcars[, -1L]) diff --git a/R-package/man/lgb.cv.Rd b/R-package/man/lgb.cv.Rd index 555cb11c7bb3..7ea2928c6166 100644 --- a/R-package/man/lgb.cv.Rd +++ b/R-package/man/lgb.cv.Rd @@ -152,6 +152,8 @@ Cross validation logic used by LightGBM \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/lgb.dump.Rd b/R-package/man/lgb.dump.Rd index f4e90242fd75..39f0e3018ac7 100644 --- a/R-package/man/lgb.dump.Rd +++ b/R-package/man/lgb.dump.Rd @@ -20,6 +20,8 @@ Dump LightGBM model to json \examples{ \donttest{ library(lightgbm) +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/lgb.get.eval.result.Rd b/R-package/man/lgb.get.eval.result.Rd index 9c2293a0f909..0dc7eb0845c3 100644 --- a/R-package/man/lgb.get.eval.result.Rd +++ b/R-package/man/lgb.get.eval.result.Rd @@ -33,6 +33,8 @@ Given a \code{lgb.Booster}, return evaluation results for a } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} # train a regression model data(agaricus.train, package = "lightgbm") train <- agaricus.train diff --git a/R-package/man/lgb.importance.Rd b/R-package/man/lgb.importance.Rd index 89a3d4e6b5b7..79cb82f5d8ef 100644 --- a/R-package/man/lgb.importance.Rd +++ b/R-package/man/lgb.importance.Rd @@ -25,6 +25,8 @@ Creates a \code{data.table} of feature importances in a model. } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/lgb.interprete.Rd b/R-package/man/lgb.interprete.Rd index c1166b2c1cc9..3acc27955c46 100644 --- a/R-package/man/lgb.interprete.Rd +++ b/R-package/man/lgb.interprete.Rd @@ -30,6 +30,8 @@ Computes feature contribution components of rawscore prediction. 
} \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} Logit <- function(x) log(x / (1.0 - x)) data(agaricus.train, package = "lightgbm") train <- agaricus.train diff --git a/R-package/man/lgb.load.Rd b/R-package/man/lgb.load.Rd index c1a00a20974b..f145db5a245e 100644 --- a/R-package/man/lgb.load.Rd +++ b/R-package/man/lgb.load.Rd @@ -20,6 +20,8 @@ Load LightGBM takes in either a file path or model string. } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/lgb.model.dt.tree.Rd b/R-package/man/lgb.model.dt.tree.Rd index 4d02ede9a001..60ef8cdac133 100644 --- a/R-package/man/lgb.model.dt.tree.Rd +++ b/R-package/man/lgb.model.dt.tree.Rd @@ -40,6 +40,8 @@ Parse a LightGBM model json dump into a \code{data.table} structure. } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/lgb.plot.importance.Rd b/R-package/man/lgb.plot.importance.Rd index 302f46460e3f..bdf354da0385 100644 --- a/R-package/man/lgb.plot.importance.Rd +++ b/R-package/man/lgb.plot.importance.Rd @@ -38,6 +38,8 @@ Features are shown ranked in a decreasing importance order. } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/lgb.plot.interpretation.Rd b/R-package/man/lgb.plot.interpretation.Rd index a914071e896f..6f168e120a4e 100644 --- a/R-package/man/lgb.plot.interpretation.Rd +++ b/R-package/man/lgb.plot.interpretation.Rd @@ -35,6 +35,8 @@ contribution of a feature. Features are shown ranked in a decreasing contributio } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} Logit <- function(x) { log(x / (1.0 - x)) } diff --git a/R-package/man/lgb.restore_handle.Rd b/R-package/man/lgb.restore_handle.Rd index 95cbdc64485d..37922c077642 100644 --- a/R-package/man/lgb.restore_handle.Rd +++ b/R-package/man/lgb.restore_handle.Rd @@ -27,7 +27,10 @@ function. If you wish to make fast single-row predictions using a \code{lgb.Boos call \link{lgb.configure_fast_predict} on the loaded \code{lgb.Booster} object. } \examples{ +\donttest{ library(lightgbm) +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data("agaricus.train") model <- lightgbm( agaricus.train$data @@ -45,6 +48,7 @@ model_new$check_null_handle() lgb.restore_handle(model_new) model_new$check_null_handle() } +} \seealso{ \link{lgb.make_serializable}, \link{lgb.drop_serialized}. } diff --git a/R-package/man/lgb.save.Rd b/R-package/man/lgb.save.Rd index efd110c7d816..62ec0ed462f6 100644 --- a/R-package/man/lgb.save.Rd +++ b/R-package/man/lgb.save.Rd @@ -21,6 +21,8 @@ Save LightGBM model } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} library(lightgbm) data(agaricus.train, package = "lightgbm") train <- agaricus.train diff --git a/R-package/man/lgb.train.Rd b/R-package/man/lgb.train.Rd index 0f2961edc415..557c85b7f9dc 100644 --- a/R-package/man/lgb.train.Rd +++ b/R-package/man/lgb.train.Rd @@ -130,6 +130,8 @@ Low-level R interface to train a LightGBM model. 
Unlike \code{\link{lightgbm}}, \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/predict.lgb.Booster.Rd b/R-package/man/predict.lgb.Booster.Rd index 2df13b9bc374..bcb2f3f980fb 100644 --- a/R-package/man/predict.lgb.Booster.Rd +++ b/R-package/man/predict.lgb.Booster.Rd @@ -121,6 +121,8 @@ If the model object has been configured for fast single-row predictions through } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/readRDS.lgb.Booster.Rd b/R-package/man/readRDS.lgb.Booster.Rd index 6a8e4c80ca91..0a144434cd36 100644 --- a/R-package/man/readRDS.lgb.Booster.Rd +++ b/R-package/man/readRDS.lgb.Booster.Rd @@ -23,6 +23,8 @@ Calls \code{readRDS} in what is expected to be a serialized \code{lgb.Booster} o \examples{ \donttest{ library(lightgbm) +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/saveRDS.lgb.Booster.Rd b/R-package/man/saveRDS.lgb.Booster.Rd index a8664243dce2..b9b34e1fd021 100644 --- a/R-package/man/saveRDS.lgb.Booster.Rd +++ b/R-package/man/saveRDS.lgb.Booster.Rd @@ -46,6 +46,8 @@ Calls \code{saveRDS} on an \code{lgb.Booster} object, making it serializable bef \examples{ \donttest{ library(lightgbm) +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/setLGBMThreads.Rd b/R-package/man/setLGBMThreads.Rd new file mode 100644 index 000000000000..53336fc2548e --- /dev/null +++ b/R-package/man/setLGBMThreads.Rd @@ -0,0 +1,32 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/multithreading.R +\name{setLGBMThreads} +\alias{setLGBMThreads} +\alias{setLGBMthreads} +\title{Set maximum number of threads used by LightGBM} +\usage{ +setLGBMthreads(num_threads) +} +\arguments{ +\item{num_threads}{maximum number of threads to be used by LightGBM in multi-threaded operations} +} +\description{ +LightGBM attempts to speed up many operations by using multi-threading. + The number of threads used in those operations can be controlled via the + \code{num_threads} parameter passed through \code{params} to functions like + \link{lgb.train} and \link{lgb.Dataset}. However, some operations (like materializing + a model from a text file) are done via code paths that don't explicitly accept thread-control + configuration. + + Use this function to set the maximum number of threads LightGBM will use for such operations. + + This function affects all LightGBM operations in the same process. + + So, for example, if you call \code{setLGBMthreads(4)}, no other multi-threaded LightGBM + operation in the same process will use more than 4 threads. + + Call \code{setLGBMthreads(-1)} to remove this limitation. 
+} +\seealso{ +\link{getLGBMthreads} +} diff --git a/R-package/man/set_field.Rd b/R-package/man/set_field.Rd index f9901e27eefd..2ceebfb87753 100644 --- a/R-package/man/set_field.Rd +++ b/R-package/man/set_field.Rd @@ -34,6 +34,8 @@ Set one attribute of a \code{lgb.Dataset} } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/slice.Rd b/R-package/man/slice.Rd index 1d7bec08de0f..a65809a239d8 100644 --- a/R-package/man/slice.Rd +++ b/R-package/man/slice.Rd @@ -23,6 +23,8 @@ Get a new \code{lgb.Dataset} containing the specified rows of } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/src/Makevars.in b/R-package/src/Makevars.in index ba9ef054bfab..c04263f62c1c 100644 --- a/R-package/src/Makevars.in +++ b/R-package/src/Makevars.in @@ -53,5 +53,6 @@ OBJECTS = \ treelearner/serial_tree_learner.o \ treelearner/tree_learner.o \ treelearner/voting_parallel_tree_learner.o \ + utils/openmp_wrapper.o \ c_api.o \ lightgbm_R.o diff --git a/R-package/src/Makevars.win.in b/R-package/src/Makevars.win.in index 14f5afde002f..86d56fecdf34 100644 --- a/R-package/src/Makevars.win.in +++ b/R-package/src/Makevars.win.in @@ -54,5 +54,6 @@ OBJECTS = \ treelearner/serial_tree_learner.o \ treelearner/tree_learner.o \ treelearner/voting_parallel_tree_learner.o \ + utils/openmp_wrapper.o \ c_api.o \ lightgbm_R.o diff --git a/R-package/src/lightgbm_R.cpp b/R-package/src/lightgbm_R.cpp index 3ae7a98d8537..4799f8540497 100644 --- a/R-package/src/lightgbm_R.cpp +++ b/R-package/src/lightgbm_R.cpp @@ -1212,6 +1212,23 @@ SEXP LGBM_BoosterGetLoadedParam_R(SEXP handle) { R_API_END(); } +SEXP LGBM_GetMaxThreads_R(SEXP out) { + R_API_BEGIN(); + int num_threads; + CHECK_CALL(LGBM_GetMaxThreads(&num_threads)); + INTEGER(out)[0] = num_threads; + return R_NilValue; + R_API_END(); +} + +SEXP LGBM_SetMaxThreads_R(SEXP num_threads) { + R_API_BEGIN(); + int new_num_threads = Rf_asInteger(num_threads); + CHECK_CALL(LGBM_SetMaxThreads(new_num_threads)); + return R_NilValue; + R_API_END(); +} + // .Call() calls static const R_CallMethodDef CallEntries[] = { {"LGBM_HandleIsNull_R" , (DL_FUNC) &LGBM_HandleIsNull_R , 1}, @@ -1268,6 +1285,8 @@ static const R_CallMethodDef CallEntries[] = { {"LGBM_BoosterDumpModel_R" , (DL_FUNC) &LGBM_BoosterDumpModel_R , 3}, {"LGBM_NullBoosterHandleError_R" , (DL_FUNC) &LGBM_NullBoosterHandleError_R , 0}, {"LGBM_DumpParamAliases_R" , (DL_FUNC) &LGBM_DumpParamAliases_R , 0}, + {"LGBM_GetMaxThreads_R" , (DL_FUNC) &LGBM_GetMaxThreads_R , 1}, + {"LGBM_SetMaxThreads_R" , (DL_FUNC) &LGBM_SetMaxThreads_R , 1}, {NULL, NULL, 0} }; diff --git a/R-package/src/lightgbm_R.h b/R-package/src/lightgbm_R.h index 7141a06a207c..4f0407e8f2ec 100644 --- a/R-package/src/lightgbm_R.h +++ b/R-package/src/lightgbm_R.h @@ -850,4 +850,23 @@ LIGHTGBM_C_EXPORT SEXP LGBM_BoosterDumpModel_R( */ LIGHTGBM_C_EXPORT SEXP LGBM_DumpParamAliases_R(); +/*! +* \brief Get current maximum number of threads used by LightGBM routines in this process. +* \param[out] out current maximum number of threads used by LightGBM. -1 means defaulting to omp_get_num_threads(). +* \return R NULL value +*/ +LIGHTGBM_C_EXPORT SEXP LGBM_GetMaxThreads_R( + SEXP out +); + + +/*! 
+* \brief Set maximum number of threads used by LightGBM routines in this process. +* \param num_threads maximum number of threads used by LightGBM. -1 means defaulting to omp_get_num_threads(). +* \return R NULL value +*/ +LIGHTGBM_C_EXPORT SEXP LGBM_SetMaxThreads_R( + SEXP num_threads +); + #endif // LIGHTGBM_R_H_ diff --git a/R-package/tests/testthat/helper.R b/R-package/tests/testthat/helper.R index 9c928c1f71d1..45edf40efbeb 100644 --- a/R-package/tests/testthat/helper.R +++ b/R-package/tests/testthat/helper.R @@ -11,6 +11,11 @@ # the check farm is a shared resource and will typically be running many checks simultaneously. # .LGB_MAX_THREADS <- 2L +setLGBMthreads(.LGB_MAX_THREADS) + +# control data.table parallelism +# ref: https://github.com/Rdatatable/data.table/issues/5658 +data.table::setDTthreads(1L) # by default, how much should results in tests be allowed to differ from hard-coded expected numbers? .LGB_NUMERIC_TOLERANCE <- 1e-6 diff --git a/R-package/tests/testthat/test_multithreading.R b/R-package/tests/testthat/test_multithreading.R new file mode 100644 index 000000000000..e2f3169627a2 --- /dev/null +++ b/R-package/tests/testthat/test_multithreading.R @@ -0,0 +1,16 @@ +test_that("getLGBMthreads() and setLGBMthreads() work as expected", { + # works with integer input + ret <- setLGBMthreads(2L) + expect_null(ret) + expect_equal(getLGBMthreads(), 2L) + + # works with float input + ret <- setLGBMthreads(1.0) + expect_null(ret) + expect_equal(getLGBMthreads(), 1L) + + # setting to any negative number sets max threads to -1 + ret <- setLGBMthreads(-312L) + expect_null(ret) + expect_equal(getLGBMthreads(), -1L) +}) diff --git a/R-package/vignettes/basic_walkthrough.Rmd b/R-package/vignettes/basic_walkthrough.Rmd index d7aaf676f386..82bd6957640c 100644 --- a/R-package/vignettes/basic_walkthrough.Rmd +++ b/R-package/vignettes/basic_walkthrough.Rmd @@ -27,6 +27,12 @@ Welcome to the world of [LightGBM](https://lightgbm.readthedocs.io/en/latest/), library(lightgbm) ``` +```{r, include=FALSE} +# limit number of threads used, to be respectful of CRAN's resources when it checks this vignette +data.table::setDTthreads(1L) +setLGBMthreads(2L) +``` + This vignette will guide you through its basic usage. It will show how to build a simple binary classification model based on a subset of the `bank` dataset (Moro, Cortez, and Rita 2014). You will use the two input features "age" and "balance" to predict whether a client has subscribed a term deposit. ## The dataset diff --git a/build-cran-package.sh b/build-cran-package.sh index 1c8a5dfbdc48..9fa0c5877085 100755 --- a/build-cran-package.sh +++ b/build-cran-package.sh @@ -227,6 +227,7 @@ if ${BUILD_VIGNETTES} ; then rm -f ./lightgbm/src/network/*.o rm -f ./lightgbm/src/objective/*.o rm -f ./lightgbm/src/treelearner/*.o + rm -f ./lightgbm/src/utils/*.o echo "re-tarring ${TARBALL_NAME}" tar \ diff --git a/include/LightGBM/c_api.h b/include/LightGBM/c_api.h index ada2e4109638..397005477a5c 100644 --- a/include/LightGBM/c_api.h +++ b/include/LightGBM/c_api.h @@ -1561,6 +1561,20 @@ LIGHTGBM_C_EXPORT int LGBM_NetworkInitWithFunctions(int num_machines, void* reduce_scatter_ext_fun, void* allgather_ext_fun); +/*! + * \brief Set maximum number of threads used by LightGBM routines in this process. + * \param num_threads maximum number of threads used by LightGBM. -1 means defaulting to omp_get_num_threads(). + * \return 0 when succeed, -1 when failure happens + */ +LIGHTGBM_C_EXPORT int LGBM_SetMaxThreads(int num_threads); + +/*! 
+ * \brief Get current maximum number of threads used by LightGBM routines in this process.
+ * \param[out] out current maximum number of threads used by LightGBM. -1 means defaulting to omp_get_num_threads().
+ * \return 0 when succeed, -1 when failure happens
+ */
+LIGHTGBM_C_EXPORT int LGBM_GetMaxThreads(int* out);
+
 #if !defined(__cplusplus) && (!defined(__STDC__) || (__STDC_VERSION__ < 199901L))
 /*! \brief Inline specifier no-op in C using standards before C99. */
 #define INLINE_FUNCTION
diff --git a/include/LightGBM/utils/openmp_wrapper.h b/include/LightGBM/utils/openmp_wrapper.h
index a337fc353b75..b9a8ea2982fc 100644
--- a/include/LightGBM/utils/openmp_wrapper.h
+++ b/include/LightGBM/utils/openmp_wrapper.h
@@ -5,6 +5,15 @@
 #ifndef LIGHTGBM_OPENMP_WRAPPER_H_
 #define LIGHTGBM_OPENMP_WRAPPER_H_
 
+#include <LightGBM/export.h>
+
+// this can only be changed by LGBM_SetMaxThreads()
+LIGHTGBM_EXTERN_C int LGBM_MAX_NUM_THREADS;
+
+// this is modified by OMP_SET_NUM_THREADS(), for example
+// by passing num_thread through params
+LIGHTGBM_EXTERN_C int LGBM_DEFAULT_NUM_THREADS;
+
 #ifdef _OPENMP
 
 #include <LightGBM/utils/log.h>
 
@@ -17,22 +26,25 @@
 #include <stdexcept>
 #include <vector>
 
-inline int OMP_NUM_THREADS() {
-  int ret = 1;
-#pragma omp parallel
-#pragma omp master
-  { ret = omp_get_num_threads(); }
-  return ret;
-}
-
-inline void OMP_SET_NUM_THREADS(int num_threads) {
-  static const int default_omp_num_threads = OMP_NUM_THREADS();
-  if (num_threads > 0) {
-    omp_set_num_threads(num_threads);
-  } else {
-    omp_set_num_threads(default_omp_num_threads);
-  }
-}
+/*
+  Get number of threads to use in OpenMP parallel regions.
+
+  By default, this will return the result of omp_get_max_threads(),
+  which is OpenMP-implementation dependent but generally can be controlled
+  by environment variable OMP_NUM_THREADS.
+
+  ref:
+  - https://www.openmp.org/spec-html/5.0/openmpsu112.html
+  - https://gcc.gnu.org/onlinedocs/libgomp/omp_005fget_005fmax_005fthreads.html
+*/
+LIGHTGBM_EXTERN_C int OMP_NUM_THREADS();
+
+/*
+  Update the default number of threads that'll be used in OpenMP parallel
+  regions for LightGBM routines where the number of threads isn't directly
+  supplied.
+*/
+LIGHTGBM_EXTERN_C void OMP_SET_NUM_THREADS(int num_threads);
 
 class ThreadExceptionHelper {
  public:
@@ -102,10 +114,7 @@
   /** Fall here if no OPENMP support, so just simulate a single thread running.
      All #pragma omp should be ignored by the compiler **/
-  inline void omp_set_num_threads(int) __GOMP_NOTHROW {}  // NOLINT (no cast done here)
   inline void OMP_SET_NUM_THREADS(int) __GOMP_NOTHROW {}
-  inline int omp_get_num_threads() __GOMP_NOTHROW {return 1;}
-  inline int omp_get_max_threads() __GOMP_NOTHROW {return 1;}
   inline int omp_get_thread_num() __GOMP_NOTHROW {return 0;}
   inline int OMP_NUM_THREADS() __GOMP_NOTHROW { return 1; }
 #ifdef __cplusplus
diff --git a/src/c_api.cpp b/src/c_api.cpp
index baf934db42b1..dbe5425bd0aa 100644
--- a/src/c_api.cpp
+++ b/src/c_api.cpp
@@ -2699,6 +2699,23 @@ int LGBM_NetworkInitWithFunctions(int num_machines, int rank,
   API_END();
 }
 
+int LGBM_SetMaxThreads(int num_threads) {
+  API_BEGIN();
+  if (num_threads <= 0) {
+    LGBM_MAX_NUM_THREADS = -1;
+  } else {
+    LGBM_MAX_NUM_THREADS = num_threads;
+  }
+  API_END();
+}
+
+int LGBM_GetMaxThreads(int* out) {
+  API_BEGIN();
+  *out = LGBM_MAX_NUM_THREADS;
+  API_END();
+}
+
+
 // ---- start of some help functions
 
diff --git a/src/utils/openmp_wrapper.cpp b/src/utils/openmp_wrapper.cpp
new file mode 100644
index 000000000000..fb6e661eb67c
--- /dev/null
+++ b/src/utils/openmp_wrapper.cpp
@@ -0,0 +1,44 @@
+/*!
+ * Copyright (c) 2023 Microsoft Corporation. All rights reserved.
+ * Licensed under the MIT License. See LICENSE file in the project root for license information.
+ */
+#include <LightGBM/utils/openmp_wrapper.h>
+
+int LGBM_MAX_NUM_THREADS = -1;
+
+int LGBM_DEFAULT_NUM_THREADS = -1;
+
+#ifdef _OPENMP
+
+#include <omp.h>
+
+int OMP_NUM_THREADS() {
+  int default_num_threads = 1;
+
+  if (LGBM_DEFAULT_NUM_THREADS > 0) {
+    // if LightGBM-specific default has been set, ignore OpenMP-global config
+    default_num_threads = LGBM_DEFAULT_NUM_THREADS;
+  } else {
+    // otherwise, default to OpenMP-global config
+    #pragma omp single
+    { default_num_threads = omp_get_max_threads(); }
+  }
+
+  // ensure that if LGBM_SetMaxThreads() was ever called, LightGBM doesn't
+  // use more than that many threads
+  if (LGBM_MAX_NUM_THREADS > 0 && default_num_threads > LGBM_MAX_NUM_THREADS) {
+    return LGBM_MAX_NUM_THREADS;
+  }
+
+  return default_num_threads;
+}
+
+void OMP_SET_NUM_THREADS(int num_threads) {
+  if (num_threads <= 0) {
+    LGBM_DEFAULT_NUM_THREADS = -1;
+  } else {
+    LGBM_DEFAULT_NUM_THREADS = num_threads;
+  }
+}
+
+#endif  // _OPENMP
diff --git a/tests/c_api_test/test_.py b/tests/c_api_test/test_.py
index 4bb76e4aba19..6cfec1c445fc 100644
--- a/tests/c_api_test/test_.py
+++ b/tests/c_api_test/test_.py
@@ -247,3 +247,36 @@ def test_booster():
                              c_str(''),
                              c_str('preb.txt'))
     LIB.LGBM_BoosterFree(booster2)
+
+
+def test_max_thread_control():
+    # at initialization, should be -1
+    num_threads = ctypes.c_int(0)
+    ret = LIB.LGBM_GetMaxThreads(
+        ctypes.byref(num_threads)
+    )
+    assert ret == 0
+    assert num_threads.value == -1
+
+    # updating that value through the C API should work
+    ret = LIB.LGBM_SetMaxThreads(
+        ctypes.c_int(6)
+    )
+    assert ret == 0
+
+    ret = LIB.LGBM_GetMaxThreads(
+        ctypes.byref(num_threads)
+    )
+    assert ret == 0
+    assert num_threads.value == 6
+
+    # resetting to any negative number should set it to -1
+    ret = LIB.LGBM_SetMaxThreads(
+        ctypes.c_int(-123)
+    )
+    assert ret == 0
+    ret = LIB.LGBM_GetMaxThreads(
+        ctypes.byref(num_threads)
+    )
+    assert ret == 0
+    assert num_threads.value == -1

From 522f0f07b0eba0e3190c3e5c8e149a205bd20bf3 Mon Sep 17 00:00:00 2001
From: Oliver Borchert
Date: Sun, 10 Dec 2023 18:29:04 +0100
Subject: [PATCH 6/8] [python-package] Add tests for passing Arrow arrays with empty chunks (#6210)

---
 include/LightGBM/arrow.h                |  2 ++
 tests/python_package_test/test_arrow.py | 38 ++++++++++++++++---------
 2 files changed, 27 insertions(+), 13 deletions(-)

diff --git a/include/LightGBM/arrow.h b/include/LightGBM/arrow.h
index 3d1c74713bd3..75511e17e72a 100644
--- a/include/LightGBM/arrow.h
+++ b/include/LightGBM/arrow.h
@@ -117,6 +117,7 @@ class ArrowChunkedArray {
                     const struct ArrowSchema* schema) {
     chunks_.reserve(n_chunks);
     for (auto k = 0; k < n_chunks; ++k) {
+      if (chunks[k].length == 0) continue;
       chunks_.push_back(&chunks[k]);
     }
     schema_ = schema;
@@ -220,6 +221,7 @@ class ArrowTable {
       std::vector<const ArrowArray*> children_chunks;
       children_chunks.reserve(n_chunks);
       for (int64_t k = 0; k < n_chunks; ++k) {
+        if (chunks[k].length == 0) continue;
         children_chunks.push_back(chunks[k].children[j]);
       }
       columns_.emplace_back(children_chunks, schema->children[j]);
diff --git a/tests/python_package_test/test_arrow.py b/tests/python_package_test/test_arrow.py
index 5e09465e34b3..7542368dcd63 100644
--- a/tests/python_package_test/test_arrow.py
+++ b/tests/python_package_test/test_arrow.py
@@ -30,18 +30,19 @@
 ]
 
 
-def generate_simple_arrow_table() -> pa.Table:
+def generate_simple_arrow_table(empty_chunks: bool = False) -> pa.Table:
+    c: list[list[int]] = [[]] if empty_chunks else []
     columns = [
-        pa.chunked_array([[1, 2, 3, 4, 5]], type=pa.uint8()),
-        pa.chunked_array([[1, 2, 3, 4, 5]], type=pa.int8()),
-        pa.chunked_array([[1, 2, 3, 4, 5]], type=pa.uint16()),
-        pa.chunked_array([[1, 2, 3, 4, 5]], type=pa.int16()),
-        pa.chunked_array([[1, 2, 3, 4, 5]], type=pa.uint32()),
-        pa.chunked_array([[1, 2, 3, 4, 5]], type=pa.int32()),
-        pa.chunked_array([[1, 2, 3, 4, 5]], type=pa.uint64()),
-        pa.chunked_array([[1, 2, 3, 4, 5]], type=pa.int64()),
-        pa.chunked_array([[1, 2, 3, 4, 5]], type=pa.float32()),
-        pa.chunked_array([[1, 2, 3, 4, 5]], type=pa.float64()),
+        pa.chunked_array(c + [[1, 2, 3]] + c + [[4, 5]] + c, type=pa.uint8()),
+        pa.chunked_array(c + [[1, 2, 3]] + c + [[4, 5]] + c, type=pa.int8()),
+        pa.chunked_array(c + [[1, 2, 3]] + c + [[4, 5]] + c, type=pa.uint16()),
+        pa.chunked_array(c + [[1, 2, 3]] + c + [[4, 5]] + c, type=pa.int16()),
+        pa.chunked_array(c + [[1, 2, 3]] + c + [[4, 5]] + c, type=pa.uint32()),
+        pa.chunked_array(c + [[1, 2, 3]] + c + [[4, 5]] + c, type=pa.int32()),
+        pa.chunked_array(c + [[1, 2, 3]] + c + [[4, 5]] + c, type=pa.uint64()),
+        pa.chunked_array(c + [[1, 2, 3]] + c + [[4, 5]] + c, type=pa.int64()),
+        pa.chunked_array(c + [[1, 2, 3]] + c + [[4, 5]] + c, type=pa.float32()),
+        pa.chunked_array(c + [[1, 2, 3]] + c + [[4, 5]] + c, type=pa.float64()),
     ]
     return pa.Table.from_arrays(columns, names=[f"col_{i}" for i in range(len(columns))])
 
@@ -104,6 +105,7 @@ def dummy_dataset_params() -> Dict[str, Any]:
     ("arrow_table_fn", "dataset_params"),
     [  # Use lambda functions here to minimize memory consumption
         (lambda: generate_simple_arrow_table(), dummy_dataset_params()),
+        (lambda: generate_simple_arrow_table(empty_chunks=True), dummy_dataset_params()),
         (lambda: generate_dummy_arrow_table(), dummy_dataset_params()),
         (lambda: generate_nullable_arrow_table(), dummy_dataset_params()),
         (lambda: generate_random_arrow_table(3, 1000, 42), {}),
@@ -160,7 +162,12 @@ def test_dataset_construct_fields_fuzzy():
 
 @pytest.mark.parametrize(
     ["array_type", "label_data"],
-    [(pa.array, [0, 1, 0, 0, 1]), (pa.chunked_array, [[0], [1, 0, 0, 1]])],
+    [
+        (pa.array, [0, 1, 0, 0, 1]),
+        (pa.chunked_array, [[0], [1, 0, 0, 1]]),
+        (pa.chunked_array, [[], [0], [1, 0, 0, 1]]),
+        (pa.chunked_array, [[0], [], [1, 0], [], [], [0, 1], []]),
+    ],
 )
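+# Note: the chunked_array cases containing [] above deliberately include
+# zero-length chunks; they exercise the empty-chunk skipping added to
+# ArrowChunkedArray and ArrowTable in include/LightGBM/arrow.h.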
@pytest.mark.parametrize("arrow_type", _INTEGER_TYPES + _FLOAT_TYPES) def test_dataset_construct_labels(array_type, label_data, arrow_type): @@ -187,7 +194,12 @@ def test_dataset_construct_weights_none(): @pytest.mark.parametrize( ["array_type", "weight_data"], - [(pa.array, [3, 0.7, 1.5, 0.5, 0.1]), (pa.chunked_array, [[3], [0.7, 1.5, 0.5, 0.1]])], + [ + (pa.array, [3, 0.7, 1.5, 0.5, 0.1]), + (pa.chunked_array, [[3], [0.7, 1.5, 0.5, 0.1]]), + (pa.chunked_array, [[], [3], [0.7, 1.5, 0.5, 0.1]]), + (pa.chunked_array, [[3], [0.7], [], [], [1.5, 0.5, 0.1], []]), + ], ) @pytest.mark.parametrize("arrow_type", _FLOAT_TYPES) def test_dataset_construct_weights(array_type, weight_data, arrow_type): From 6fc80528f15b92921ecffaaa14b6bddaa0de3404 Mon Sep 17 00:00:00 2001 From: June Liu <103498042+Zhaojun-Liu@users.noreply.github.com> Date: Wed, 13 Dec 2023 12:06:28 +0800 Subject: [PATCH 7/8] fix errors from MSVC '/permissive-' mode (fixes #6230) (#6232) --- include/LightGBM/arrow.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/LightGBM/arrow.h b/include/LightGBM/arrow.h index 75511e17e72a..767da12a9809 100644 --- a/include/LightGBM/arrow.h +++ b/include/LightGBM/arrow.h @@ -16,6 +16,7 @@ #include #include #include +#include /* -------------------------------------- C DATA INTERFACE ------------------------------------- */ // The C data interface is taken from From 2dfb9a40478b965db8325baa21a63d9281f96b7c Mon Sep 17 00:00:00 2001 From: Oliver Borchert Date: Thu, 14 Dec 2023 04:35:46 +0100 Subject: [PATCH 8/8] [python-package] Allow to pass Arrow table for prediction (#6168) --- include/LightGBM/c_api.h | 34 ++++++ python-package/lightgbm/basic.py | 56 +++++++++- src/c_api.cpp | 51 +++++++++ tests/python_package_test/test_arrow.py | 133 +++++++++++++++++++++--- 4 files changed, 259 insertions(+), 15 deletions(-) diff --git a/include/LightGBM/c_api.h b/include/LightGBM/c_api.h index 397005477a5c..b43f096c31ee 100644 --- a/include/LightGBM/c_api.h +++ b/include/LightGBM/c_api.h @@ -1417,6 +1417,40 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForMats(BoosterHandle handle, int64_t* out_len, double* out_result); +/*! + * \brief Make prediction for a new dataset. + * \note + * You should pre-allocate memory for ``out_result``: + * - for normal and raw score, its length is equal to ``num_class * num_data``; + * - for leaf index, its length is equal to ``num_class * num_data * num_iteration``; + * - for feature contributions, its length is equal to ``num_class * num_data * (num_feature + 1)``. + * \param handle Handle of booster + * \param n_chunks The number of Arrow arrays passed to this function + * \param chunks Pointer to the list of Arrow arrays + * \param schema Pointer to the schema of all Arrow arrays + * \param predict_type What should be predicted + * - ``C_API_PREDICT_NORMAL``: normal prediction, with transform (if needed); + * - ``C_API_PREDICT_RAW_SCORE``: raw score; + * - ``C_API_PREDICT_LEAF_INDEX``: leaf index; + * - ``C_API_PREDICT_CONTRIB``: feature contributions (SHAP values) + * \param start_iteration Start index of the iteration to predict + * \param num_iteration Number of iteration for prediction, <= 0 means no limit + * \param parameter Other parameters for prediction, e.g. 
early stopping for prediction + * \param[out] out_len Length of output result + * \param[out] out_result Pointer to array with predictions + * \return 0 when succeed, -1 when failure happens + */ +LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForArrow(BoosterHandle handle, + int64_t n_chunks, + const ArrowArray* chunks, + const ArrowSchema* schema, + int predict_type, + int start_iteration, + int num_iteration, + const char* parameter, + int64_t* out_len, + double* out_result); + /*! * \brief Save model into file. * \param handle Handle of booster diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index c4022e7fdd9a..560a9a438872 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -115,7 +115,8 @@ np.ndarray, pd_DataFrame, dt_DataTable, - scipy.sparse.spmatrix + scipy.sparse.spmatrix, + pa_Table, ] _LGBM_WeightType = Union[ List[float], @@ -1069,7 +1070,7 @@ def predict( Parameters ---------- - data : str, pathlib.Path, numpy array, pandas DataFrame, H2O DataTable's Frame or scipy.sparse + data : str, pathlib.Path, numpy array, pandas DataFrame, pyarrow Table, H2O DataTable's Frame or scipy.sparse Data source for prediction. If str or pathlib.Path, it represents the path to a text file (CSV, TSV, or LibSVM). start_iteration : int, optional (default=0) @@ -1161,6 +1162,13 @@ def predict( num_iteration=num_iteration, predict_type=predict_type ) + elif _is_pyarrow_table(data): + preds, nrow = self.__pred_for_pyarrow_table( + table=data, + start_iteration=start_iteration, + num_iteration=num_iteration, + predict_type=predict_type + ) elif isinstance(data, list): try: data = np.array(data) @@ -1614,6 +1622,48 @@ def __pred_for_csc( if n_preds != out_num_preds.value: raise ValueError("Wrong length for predict results") return preds, nrow + + def __pred_for_pyarrow_table( + self, + table: pa_Table, + start_iteration: int, + num_iteration: int, + predict_type: int + ) -> Tuple[np.ndarray, int]: + """Predict for a PyArrow table.""" + if not PYARROW_INSTALLED: + raise LightGBMError("Cannot predict from Arrow without `pyarrow` installed.") + + # Check that the input is valid: we only handle numbers (for now) + if not all(arrow_is_integer(t) or arrow_is_floating(t) for t in table.schema.types): + raise ValueError("Arrow table may only have integer or floating point datatypes") + + # Prepare prediction output array + n_preds = self.__get_num_preds( + start_iteration=start_iteration, + num_iteration=num_iteration, + nrow=table.num_rows, + predict_type=predict_type + ) + preds = np.empty(n_preds, dtype=np.float64) + out_num_preds = ctypes.c_int64(0) + + # Export Arrow table to C and run prediction + c_array = _export_arrow_to_c(table) + _safe_call(_LIB.LGBM_BoosterPredictForArrow( + self._handle, + ctypes.c_int64(c_array.n_chunks), + ctypes.c_void_p(c_array.chunks_ptr), + ctypes.c_void_p(c_array.schema_ptr), + ctypes.c_int(predict_type), + ctypes.c_int(start_iteration), + ctypes.c_int(num_iteration), + _c_str(self.pred_parameter), + ctypes.byref(out_num_preds), + preds.ctypes.data_as(ctypes.POINTER(ctypes.c_double)))) + if n_preds != out_num_preds.value: + raise ValueError("Wrong length for predict results") + return preds, table.num_rows def current_iteration(self) -> int: """Get the index of the current iteration. 
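The pre-allocation contract documented for LGBM_BoosterPredictForArrow is easy to get wrong, so the size arithmetic is worth spelling out. A small illustrative calculation (the dimension values below are made up; in the Python wrapper, __get_num_preds() performs this computation):

# expected lengths of out_result, per the LGBM_BoosterPredictForArrow docs
num_class, num_data, num_iteration, num_feature = 3, 1000, 5, 10

normal_or_raw_len = num_class * num_data                # 3000
leaf_index_len = num_class * num_data * num_iteration   # 15000
contrib_len = num_class * num_data * (num_feature + 1)  # 33000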
@@ -4350,7 +4400,7 @@ def predict(
 
         Parameters
         ----------
-        data : str, pathlib.Path, numpy array, pandas DataFrame, H2O DataTable's Frame or scipy.sparse
+        data : str, pathlib.Path, numpy array, pandas DataFrame, pyarrow Table, H2O DataTable's Frame or scipy.sparse
             Data source for prediction.
             If str or pathlib.Path, it represents the path to a text file (CSV, TSV, or LibSVM).
         start_iteration : int, optional (default=0)
diff --git a/src/c_api.cpp b/src/c_api.cpp
index dbe5425bd0aa..67b18003588a 100644
--- a/src/c_api.cpp
+++ b/src/c_api.cpp
@@ -2568,6 +2568,57 @@ int LGBM_BoosterPredictForMats(BoosterHandle handle,
   API_END();
 }
 
+int LGBM_BoosterPredictForArrow(BoosterHandle handle,
+                                int64_t n_chunks,
+                                const ArrowArray* chunks,
+                                const ArrowSchema* schema,
+                                int predict_type,
+                                int start_iteration,
+                                int num_iteration,
+                                const char* parameter,
+                                int64_t* out_len,
+                                double* out_result) {
+  API_BEGIN();
+
+  // Apply the configuration
+  auto param = Config::Str2Map(parameter);
+  Config config;
+  config.Set(param);
+  OMP_SET_NUM_THREADS(config.num_threads);
+
+  // Set up chunked array and iterators for all columns
+  ArrowTable table(n_chunks, chunks, schema);
+  std::vector<ArrowChunkedArray::Iterator<double>> its;
+  its.reserve(table.get_num_columns());
+  for (int64_t j = 0; j < table.get_num_columns(); ++j) {
+    its.emplace_back(table.get_column(j).begin<double>());
+  }
+
+  // Build row function
+  auto num_columns = table.get_num_columns();
+  auto row_fn = [num_columns, &its] (int row_idx) {
+    std::vector<std::pair<int, double>> result;
+    result.reserve(num_columns);
+    for (int64_t j = 0; j < num_columns; ++j) {
+      result.emplace_back(static_cast<int>(j), its[j][row_idx]);
+    }
+    return result;
+  };
+
+  // Run prediction
+  Booster* ref_booster = reinterpret_cast<Booster*>(handle);
+  ref_booster->Predict(start_iteration,
+                       num_iteration,
+                       predict_type,
+                       static_cast<int32_t>(table.get_num_rows()),
+                       static_cast<int32_t>(table.get_num_columns()),
+                       row_fn,
+                       config,
+                       out_result,
+                       out_len);
+  API_END();
+}
+
 int LGBM_BoosterSaveModel(BoosterHandle handle,
                           int start_iteration,
                           int num_iteration,
diff --git a/tests/python_package_test/test_arrow.py b/tests/python_package_test/test_arrow.py
index 7542368dcd63..593c03d8c7ef 100644
--- a/tests/python_package_test/test_arrow.py
+++ b/tests/python_package_test/test_arrow.py
@@ -1,6 +1,6 @@
 # coding: utf-8
 import filecmp
-from typing import Any, Dict
+from typing import Any, Dict, Optional
 
 import numpy as np
 import pyarrow as pa
@@ -63,19 +63,40 @@ def generate_dummy_arrow_table() -> pa.Table:
     return pa.Table.from_arrays([col1, col2], names=["a", "b"])
 
 
-def generate_random_arrow_table(num_columns: int, num_datapoints: int, seed: int) -> pa.Table:
-    columns = [generate_random_arrow_array(num_datapoints, seed + i) for i in range(num_columns)]
+def generate_random_arrow_table(
+    num_columns: int,
+    num_datapoints: int,
+    seed: int,
+    generate_nulls: bool = True,
+    values: Optional[np.ndarray] = None,
+) -> pa.Table:
+    columns = [
+        generate_random_arrow_array(
+            num_datapoints, seed + i, generate_nulls=generate_nulls, values=values
+        )
+        for i in range(num_columns)
+    ]
     names = [f"col_{i}" for i in range(num_columns)]
     return pa.Table.from_arrays(columns, names=names)
 
 
-def generate_random_arrow_array(num_datapoints: int, seed: int) -> pa.ChunkedArray:
+def generate_random_arrow_array(
+    num_datapoints: int,
+    seed: int,
+    generate_nulls: bool = True,
+    values: Optional[np.ndarray] = None,
+) -> pa.ChunkedArray:
     generator = np.random.default_rng(seed)
-    data = generator.standard_normal(num_datapoints)
+    data = (
+        generator.standard_normal(num_datapoints)
+
if values is None + else generator.choice(values, size=num_datapoints, replace=True) + ) # Set random nulls - indices = generator.choice(len(data), size=num_datapoints // 10) - data[indices] = None + if generate_nulls: + indices = generator.choice(len(data), size=num_datapoints // 10) + data[indices] = None # Split data into <=2 random chunks split_points = np.sort(generator.choice(np.arange(1, num_datapoints), 2, replace=False)) @@ -131,8 +152,8 @@ def test_dataset_construct_fuzzy(tmp_path, arrow_table_fn, dataset_params): def test_dataset_construct_fields_fuzzy(): arrow_table = generate_random_arrow_table(3, 1000, 42) - arrow_labels = generate_random_arrow_array(1000, 42) - arrow_weights = generate_random_arrow_array(1000, 42) + arrow_labels = generate_random_arrow_array(1000, 42, generate_nulls=False) + arrow_weights = generate_random_arrow_array(1000, 42, generate_nulls=False) arrow_groups = pa.chunked_array([[300, 400, 50], [250]], type=pa.int32()) arrow_dataset = lgb.Dataset( @@ -264,9 +285,9 @@ def test_dataset_construct_init_scores_table(): data = generate_dummy_arrow_table() init_scores = pa.Table.from_arrays( [ - generate_random_arrow_array(5, seed=1), - generate_random_arrow_array(5, seed=2), - generate_random_arrow_array(5, seed=3), + generate_random_arrow_array(5, seed=1, generate_nulls=False), + generate_random_arrow_array(5, seed=2, generate_nulls=False), + generate_random_arrow_array(5, seed=3, generate_nulls=False), ], names=["a", "b", "c"], ) @@ -276,3 +297,91 @@ def test_dataset_construct_init_scores_table(): actual = dataset.get_init_score() expected = init_scores.to_pandas().to_numpy().astype(np.float64) np_assert_array_equal(expected, actual, strict=True) + + +# ------------------------------------------ PREDICTION ----------------------------------------- # + + +def assert_equal_predict_arrow_pandas(booster: lgb.Booster, data: pa.Table): + p_arrow = booster.predict(data) + p_pandas = booster.predict(data.to_pandas()) + np_assert_array_equal(p_arrow, p_pandas, strict=True) + + p_raw_arrow = booster.predict(data, raw_score=True) + p_raw_pandas = booster.predict(data.to_pandas(), raw_score=True) + np_assert_array_equal(p_raw_arrow, p_raw_pandas, strict=True) + + p_leaf_arrow = booster.predict(data, pred_leaf=True) + p_leaf_pandas = booster.predict(data.to_pandas(), pred_leaf=True) + np_assert_array_equal(p_leaf_arrow, p_leaf_pandas, strict=True) + + p_pred_contrib_arrow = booster.predict(data, pred_contrib=True) + p_pred_contrib_pandas = booster.predict(data.to_pandas(), pred_contrib=True) + np_assert_array_equal(p_pred_contrib_arrow, p_pred_contrib_pandas, strict=True) + + p_first_iter_arrow = booster.predict(data, start_iteration=0, num_iteration=1, raw_score=True) + p_first_iter_pandas = booster.predict( + data.to_pandas(), start_iteration=0, num_iteration=1, raw_score=True + ) + np_assert_array_equal(p_first_iter_arrow, p_first_iter_pandas, strict=True) + + +def test_predict_regression(): + data = generate_random_arrow_table(10, 10000, 42) + dataset = lgb.Dataset( + data, + label=generate_random_arrow_array(10000, 43, generate_nulls=False), + params=dummy_dataset_params(), + ) + booster = lgb.train( + {"objective": "regression", "num_leaves": 7}, + dataset, + num_boost_round=5, + ) + assert_equal_predict_arrow_pandas(booster, data) + + +def test_predict_binary_classification(): + data = generate_random_arrow_table(10, 10000, 42) + dataset = lgb.Dataset( + data, + label=generate_random_arrow_array(10000, 43, generate_nulls=False, values=np.arange(2)), + 
params=dummy_dataset_params(), + ) + booster = lgb.train( + {"objective": "binary", "num_leaves": 7}, + dataset, + num_boost_round=5, + ) + assert_equal_predict_arrow_pandas(booster, data) + + +def test_predict_multiclass_classification(): + data = generate_random_arrow_table(10, 10000, 42) + dataset = lgb.Dataset( + data, + label=generate_random_arrow_array(10000, 43, generate_nulls=False, values=np.arange(5)), + params=dummy_dataset_params(), + ) + booster = lgb.train( + {"objective": "multiclass", "num_leaves": 7, "num_class": 5}, + dataset, + num_boost_round=5, + ) + assert_equal_predict_arrow_pandas(booster, data) + + +def test_predict_ranking(): + data = generate_random_arrow_table(10, 10000, 42) + dataset = lgb.Dataset( + data, + label=generate_random_arrow_array(10000, 43, generate_nulls=False, values=np.arange(4)), + group=np.array([1000, 2000, 3000, 4000]), + params=dummy_dataset_params(), + ) + booster = lgb.train( + {"objective": "lambdarank", "num_leaves": 7}, + dataset, + num_boost_round=5, + ) + assert_equal_predict_arrow_pandas(booster, data)
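
Taken together, the series lets pyarrow data drive the whole train/predict cycle without a pandas detour. A minimal end-to-end sketch (illustrative only: the feature values, parameter choices, and column names are invented here; it assumes a lightgbm build that contains these patches, plus numpy and pyarrow installed):

import lightgbm as lgb
import numpy as np
import pyarrow as pa

rng = np.random.default_rng(42)

# chunked columns mimic how Arrow data usually arrives, e.g. from a Parquet scan
table = pa.Table.from_arrays(
    [
        pa.chunked_array([rng.standard_normal(600), rng.standard_normal(400)]),
        pa.chunked_array([rng.standard_normal(600), rng.standard_normal(400)]),
    ],
    names=["x0", "x1"],
)
label = pa.chunked_array([rng.integers(0, 2, 600), rng.integers(0, 2, 400)])

dataset = lgb.Dataset(table, label=label)
booster = lgb.train({"objective": "binary", "verbose": -1}, dataset, num_boost_round=5)

# the same Arrow table feeds predict(), which routes through LGBM_BoosterPredictForArrow
preds = booster.predict(table)
raw = booster.predict(table, raw_score=True)
print(preds.shape, raw.shape)  # (1000,) (1000,)

As the tests above assert, predictions from the Arrow path match booster.predict(table.to_pandas()) exactly, since both paths ultimately run through the same Booster::Predict machinery in c_api.cpp.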