From ee0131c554844084d6b4eb7fbdf76c069fc40ad2 Mon Sep 17 00:00:00 2001 From: Nikita Titov Date: Thu, 19 Dec 2024 01:01:38 +0300 Subject: [PATCH 1/4] [ci] Add Release Drafter workflow (#6754) Co-authored-by: James Lamb --- .../{no-response.yml => no_response.yml} | 2 +- .github/workflows/release_drafter.yml | 23 +++++++++++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) rename .github/workflows/{no-response.yml => no_response.yml} (96%) create mode 100644 .github/workflows/release_drafter.yml diff --git a/.github/workflows/no-response.yml b/.github/workflows/no_response.yml similarity index 96% rename from .github/workflows/no-response.yml rename to .github/workflows/no_response.yml index 051d44fba20c..1994f53d7bc7 100644 --- a/.github/workflows/no-response.yml +++ b/.github/workflows/no_response.yml @@ -13,7 +13,7 @@ on: jobs: noResponse: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - uses: lee-dohm/no-response@v0.5.0 with: diff --git a/.github/workflows/release_drafter.yml b/.github/workflows/release_drafter.yml new file mode 100644 index 000000000000..80ad63b219bb --- /dev/null +++ b/.github/workflows/release_drafter.yml @@ -0,0 +1,23 @@ +name: Release Drafter + +permissions: + contents: read + +on: + push: + branches: + - master + +jobs: + updateReleaseDraft: + permissions: + contents: write + pull-requests: read + runs-on: ubuntu-latest + steps: + - uses: release-drafter/release-drafter@v6.0.0 + with: + config-name: release-drafter.yml + disable-autolabeler: true + env: + GITHUB_TOKEN: ${{ github.token }} From 4ee0bc05be56754672a5a36c0c402634b930d88e Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sun, 22 Dec 2024 10:27:40 -0500 Subject: [PATCH 2/4] [python-package] stop relying on string concatenation / splitting for cv() eval results (#6761) Co-authored-by: Nikita Titov --- python-package/lightgbm/callback.py | 72 +++++++++++++----------- python-package/lightgbm/engine.py | 38 +++++++++---- tests/python_package_test/test_engine.py | 21 +++++++ 3 files changed, 86 insertions(+), 45 deletions(-) diff --git a/python-package/lightgbm/callback.py b/python-package/lightgbm/callback.py index c64fb8ba755b..be8d04ed7a5e 100644 --- a/python-package/lightgbm/callback.py +++ b/python-package/lightgbm/callback.py @@ -71,6 +71,14 @@ class CallbackEnv: evaluation_result_list: Optional[_ListOfEvalResultTuples] +def _is_using_cv(env: CallbackEnv) -> bool: + """Check if model in callback env is a CVBooster.""" + # this import is here to avoid a circular import + from .engine import CVBooster + + return isinstance(env.model, CVBooster) + + def _format_eval_result(value: _EvalResultTuple, show_stdv: bool) -> str: """Format metric string.""" dataset_name, metric_name, metric_value, *_ = value @@ -143,16 +151,13 @@ def _init(self, env: CallbackEnv) -> None: ) self.eval_result.clear() for item in env.evaluation_result_list: - if len(item) == 4: # regular train - data_name, eval_name = item[:2] - else: # cv - data_name, eval_name = item[1].split() - self.eval_result.setdefault(data_name, OrderedDict()) + dataset_name, metric_name, *_ = item + self.eval_result.setdefault(dataset_name, OrderedDict()) if len(item) == 4: - self.eval_result[data_name].setdefault(eval_name, []) + self.eval_result[dataset_name].setdefault(metric_name, []) else: - self.eval_result[data_name].setdefault(f"{eval_name}-mean", []) - self.eval_result[data_name].setdefault(f"{eval_name}-stdv", []) + self.eval_result[dataset_name].setdefault(f"{metric_name}-mean", []) + 
self.eval_result[dataset_name].setdefault(f"{metric_name}-stdv", []) def __call__(self, env: CallbackEnv) -> None: if env.iteration == env.begin_iteration: @@ -163,15 +168,16 @@ def __call__(self, env: CallbackEnv) -> None: "Please report it at https://github.com/microsoft/LightGBM/issues" ) for item in env.evaluation_result_list: + # for cv(), 'metric_value' is actually a mean of metric values over all CV folds + dataset_name, metric_name, metric_value, *_ = item if len(item) == 4: - data_name, eval_name, result = item[:3] - self.eval_result[data_name][eval_name].append(result) + # train() + self.eval_result[dataset_name][metric_name].append(metric_value) else: - data_name, eval_name = item[1].split() - res_mean = item[2] - res_stdv = item[4] # type: ignore[misc] - self.eval_result[data_name][f"{eval_name}-mean"].append(res_mean) - self.eval_result[data_name][f"{eval_name}-stdv"].append(res_stdv) + # cv() + metric_std_dev = item[4] # type: ignore[misc] + self.eval_result[dataset_name][f"{metric_name}-mean"].append(metric_value) + self.eval_result[dataset_name][f"{metric_name}-stdv"].append(metric_std_dev) def record_evaluation(eval_result: Dict[str, Dict[str, List[Any]]]) -> Callable: @@ -304,15 +310,15 @@ def _gt_delta(self, curr_score: float, best_score: float, delta: float) -> bool: def _lt_delta(self, curr_score: float, best_score: float, delta: float) -> bool: return curr_score < best_score - delta - def _is_train_set(self, ds_name: str, eval_name: str, env: CallbackEnv) -> bool: + def _is_train_set(self, dataset_name: str, env: CallbackEnv) -> bool: """Check, by name, if a given Dataset is the training data.""" # for lgb.cv() with eval_train_metric=True, evaluation is also done on the training set # and those metrics are considered for early stopping - if ds_name == "cv_agg" and eval_name == "train": + if _is_using_cv(env) and dataset_name == "train": return True # for lgb.train(), it's possible to pass the training data via valid_sets with any eval_name - if isinstance(env.model, Booster) and ds_name == env.model._train_data_name: + if isinstance(env.model, Booster) and dataset_name == env.model._train_data_name: return True return False @@ -327,11 +333,13 @@ def _init(self, env: CallbackEnv) -> None: _log_warning("Early stopping is not available in dart mode") return + # get details of the first dataset + first_dataset_name, first_metric_name, *_ = env.evaluation_result_list[0] + # validation sets are guaranteed to not be identical to the training data in cv() if isinstance(env.model, Booster): only_train_set = len(env.evaluation_result_list) == 1 and self._is_train_set( - ds_name=env.evaluation_result_list[0][0], - eval_name=env.evaluation_result_list[0][1].split(" ")[0], + dataset_name=first_dataset_name, env=env, ) if only_train_set: @@ -370,8 +378,7 @@ def _init(self, env: CallbackEnv) -> None: _log_info(f"Using {self.min_delta} as min_delta for all metrics.") deltas = [self.min_delta] * n_datasets * n_metrics - # split is needed for " " case (e.g. 
"train l1") - self.first_metric = env.evaluation_result_list[0][1].split(" ")[-1] + self.first_metric = first_metric_name for eval_ret, delta in zip(env.evaluation_result_list, deltas): self.best_iter.append(0) if eval_ret[3]: # greater is better @@ -381,7 +388,7 @@ def _init(self, env: CallbackEnv) -> None: self.best_score.append(float("inf")) self.cmp_op.append(partial(self._lt_delta, delta=delta)) - def _final_iteration_check(self, env: CallbackEnv, eval_name_splitted: List[str], i: int) -> None: + def _final_iteration_check(self, *, env: CallbackEnv, metric_name: str, i: int) -> None: if env.iteration == env.end_iteration - 1: if self.verbose: best_score_str = "\t".join([_format_eval_result(x, show_stdv=True) for x in self.best_score_list[i]]) @@ -389,7 +396,7 @@ def _final_iteration_check(self, env: CallbackEnv, eval_name_splitted: List[str] "Did not meet early stopping. " f"Best iteration is:\n[{self.best_iter[i] + 1}]\t{best_score_str}" ) if self.first_metric_only: - _log_info(f"Evaluated only: {eval_name_splitted[-1]}") + _log_info(f"Evaluated only: {metric_name}") raise EarlyStopException(self.best_iter[i], self.best_score_list[i]) def __call__(self, env: CallbackEnv) -> None: @@ -405,21 +412,18 @@ def __call__(self, env: CallbackEnv) -> None: # self.best_score_list is initialized to an empty list first_time_updating_best_score_list = self.best_score_list == [] for i in range(len(env.evaluation_result_list)): - score = env.evaluation_result_list[i][2] - if first_time_updating_best_score_list or self.cmp_op[i](score, self.best_score[i]): - self.best_score[i] = score + dataset_name, metric_name, metric_value, *_ = env.evaluation_result_list[i] + if first_time_updating_best_score_list or self.cmp_op[i](metric_value, self.best_score[i]): + self.best_score[i] = metric_value self.best_iter[i] = env.iteration if first_time_updating_best_score_list: self.best_score_list.append(env.evaluation_result_list) else: self.best_score_list[i] = env.evaluation_result_list - # split is needed for " " case (e.g. 
"train l1") - eval_name_splitted = env.evaluation_result_list[i][1].split(" ") - if self.first_metric_only and self.first_metric != eval_name_splitted[-1]: + if self.first_metric_only and self.first_metric != metric_name: continue # use only the first metric for early stopping if self._is_train_set( - ds_name=env.evaluation_result_list[i][0], - eval_name=eval_name_splitted[0], + dataset_name=dataset_name, env=env, ): continue # train data for lgb.cv or sklearn wrapper (underlying lgb.train) @@ -430,9 +434,9 @@ def __call__(self, env: CallbackEnv) -> None: ) _log_info(f"Early stopping, best iteration is:\n[{self.best_iter[i] + 1}]\t{eval_result_str}") if self.first_metric_only: - _log_info(f"Evaluated only: {eval_name_splitted[-1]}") + _log_info(f"Evaluated only: {metric_name}") raise EarlyStopException(self.best_iter[i], self.best_score_list[i]) - self._final_iteration_check(env, eval_name_splitted, i) + self._final_iteration_check(env=env, metric_name=metric_name, i=i) def _should_enable_early_stopping(stopping_rounds: Any) -> bool: diff --git a/python-package/lightgbm/engine.py b/python-package/lightgbm/engine.py index 20dfc62b8856..ccbb4376a89f 100644 --- a/python-package/lightgbm/engine.py +++ b/python-package/lightgbm/engine.py @@ -581,15 +581,31 @@ def _agg_cv_result( raw_results: List[List[_LGBM_BoosterEvalMethodResultType]], ) -> List[_LGBM_BoosterEvalMethodResultWithStandardDeviationType]: """Aggregate cross-validation results.""" - cvmap: Dict[str, List[float]] = OrderedDict() - metric_type: Dict[str, bool] = {} + # build up 2 maps, of the form: + # + # OrderedDict{ + # (, ): + # } + # + # OrderedDict{ + # (, ): list[] + # } + # + metric_types: Dict[Tuple[str, str], bool] = OrderedDict() + metric_values: Dict[Tuple[str, str], List[float]] = OrderedDict() for one_result in raw_results: - for one_line in one_result: - key = f"{one_line[0]} {one_line[1]}" - metric_type[key] = one_line[3] - cvmap.setdefault(key, []) - cvmap[key].append(one_line[2]) - return [("cv_agg", k, float(np.mean(v)), metric_type[k], float(np.std(v))) for k, v in cvmap.items()] + for dataset_name, metric_name, metric_value, is_higher_better in one_result: + key = (dataset_name, metric_name) + metric_types[key] = is_higher_better + metric_values.setdefault(key, []) + metric_values[key].append(metric_value) + + # turn that into a list of tuples of the form: + # + # [ + # (, , mean(), , std_dev()) + # ] + return [(k[0], k[1], float(np.mean(v)), metric_types[k], float(np.std(v))) for k, v in metric_values.items()] def cv( @@ -812,9 +828,9 @@ def cv( ) cvbooster.update(fobj=fobj) # type: ignore[call-arg] res = _agg_cv_result(cvbooster.eval_valid(feval)) # type: ignore[call-arg] - for _, key, mean, _, std in res: - results[f"{key}-mean"].append(mean) - results[f"{key}-stdv"].append(std) + for dataset_name, metric_name, metric_mean, _, metric_std_dev in res: + results[f"{dataset_name} {metric_name}-mean"].append(metric_mean) + results[f"{dataset_name} {metric_name}-stdv"].append(metric_std_dev) try: for cb in callbacks_after_iter: cb( diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 05afddb77c77..a1797d1c1187 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -64,6 +64,13 @@ def constant_metric(preds, train_data): return ("error", 0.0, False) +def constant_metric_multi(preds, train_data): + return [ + ("important_metric", 1.5, False), + ("irrelevant_metric", 7.8, False), + ] + + def decreasing_metric(preds, 
train_data): return ("decreasing_metric", next(decreasing_generator), False) @@ -2570,6 +2577,13 @@ def train_booster(params=params_obj_verbose, **kwargs): assert "valid binary_logloss-mean" in res assert "valid error-mean" in res + # default metric in args with 1 custom function returning a list of 2 metrics + res = get_cv_result(metrics="binary_logloss", feval=constant_metric_multi) + assert len(res) == 6 + assert "valid binary_logloss-mean" in res + assert res["valid important_metric-mean"] == [1.5, 1.5] + assert res["valid irrelevant_metric-mean"] == [7.8, 7.8] + # non-default metric in args with custom one res = get_cv_result(metrics="binary_error", feval=constant_metric) assert len(res) == 4 @@ -2703,6 +2717,13 @@ def train_booster(params=params_obj_verbose, **kwargs): assert "binary_logloss" in evals_result["valid_0"] assert "error" in evals_result["valid_0"] + # default metric in params with custom function returning a list of 2 metrics + train_booster(params=params_obj_metric_log_verbose, feval=constant_metric_multi) + assert len(evals_result["valid_0"]) == 3 + assert "binary_logloss" in evals_result["valid_0"] + assert evals_result["valid_0"]["important_metric"] == [1.5, 1.5] + assert evals_result["valid_0"]["irrelevant_metric"] == [7.8, 7.8] + # non-default metric in params with custom one train_booster(params=params_obj_metric_err_verbose, feval=constant_metric) assert len(evals_result["valid_0"]) == 2 From 60b0155ac573a8ad5994c74c49e05854281e2469 Mon Sep 17 00:00:00 2001 From: RektPunk <110188257+RektPunk@users.noreply.github.com> Date: Mon, 23 Dec 2024 00:35:51 +0900 Subject: [PATCH 3/4] [python-package] Fix inconsistency in `predict()` output shape for 1-tree models (#6753) --- python-package/lightgbm/basic.py | 2 +- tests/python_package_test/test_engine.py | 90 +++++++++++++++++++++++- 2 files changed, 90 insertions(+), 2 deletions(-) diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index e06290dc1c5f..7b152fd2b006 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -1248,7 +1248,7 @@ def predict( if pred_leaf: preds = preds.astype(np.int32) is_sparse = isinstance(preds, (list, scipy.sparse.spmatrix)) - if not is_sparse and preds.size != nrow: + if not is_sparse and (preds.size != nrow or pred_leaf or pred_contrib): if preds.size % nrow == 0: preds = preds.reshape(nrow, -1) else: diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index a1797d1c1187..667cb86c1a14 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -15,7 +15,7 @@ import psutil import pytest from scipy.sparse import csr_matrix, isspmatrix_csc, isspmatrix_csr -from sklearn.datasets import load_svmlight_file, make_blobs, make_multilabel_classification +from sklearn.datasets import load_svmlight_file, make_blobs, make_classification, make_multilabel_classification from sklearn.metrics import average_precision_score, log_loss, mean_absolute_error, mean_squared_error, roc_auc_score from sklearn.model_selection import GroupKFold, TimeSeriesSplit, train_test_split @@ -2314,6 +2314,33 @@ def test_refit(): assert err_pred > new_err_pred +def test_refit_with_one_tree_regression(): + X, y = make_synthetic_regression(n_samples=1_000, n_features=2) + lgb_train = lgb.Dataset(X, label=y) + params = {"objective": "regression", "verbosity": -1} + model = lgb.train(params, lgb_train, num_boost_round=1) + model_refit = model.refit(X, y) + assert 
isinstance(model_refit, lgb.Booster) + + +def test_refit_with_one_tree_binary_classification(): + X, y = load_breast_cancer(return_X_y=True) + lgb_train = lgb.Dataset(X, label=y) + params = {"objective": "binary", "verbosity": -1} + model = lgb.train(params, lgb_train, num_boost_round=1) + model_refit = model.refit(X, y) + assert isinstance(model_refit, lgb.Booster) + + +def test_refit_with_one_tree_multiclass_classification(): + X, y = load_iris(return_X_y=True) + lgb_train = lgb.Dataset(X, y) + params = {"objective": "multiclass", "num_class": 3, "verbose": -1} + model = lgb.train(params, lgb_train, num_boost_round=1) + model_refit = model.refit(X, y) + assert isinstance(model_refit, lgb.Booster) + + def test_refit_dataset_params(rng): # check refit accepts dataset_params X, y = load_breast_cancer(return_X_y=True) @@ -3872,6 +3899,67 @@ def test_predict_stump(rng, use_init_score): np.testing.assert_allclose(preds_all, np.full_like(preds_all, fill_value=y_avg)) +def test_predict_regression_output_shape(): + n_samples = 1_000 + n_features = 4 + X, y = make_synthetic_regression(n_samples=n_samples, n_features=n_features) + dtrain = lgb.Dataset(X, label=y) + params = {"objective": "regression", "verbosity": -1} + + # 1-round model + bst = lgb.train(params, dtrain, num_boost_round=1) + assert bst.predict(X).shape == (n_samples,) + assert bst.predict(X, pred_contrib=True).shape == (n_samples, n_features + 1) + assert bst.predict(X, pred_leaf=True).shape == (n_samples, 1) + + # 2-round model + bst = lgb.train(params, dtrain, num_boost_round=2) + assert bst.predict(X).shape == (n_samples,) + assert bst.predict(X, pred_contrib=True).shape == (n_samples, n_features + 1) + assert bst.predict(X, pred_leaf=True).shape == (n_samples, 2) + + +def test_predict_binary_classification_output_shape(): + n_samples = 1_000 + n_features = 4 + X, y = make_classification(n_samples=n_samples, n_features=n_features, n_classes=2) + dtrain = lgb.Dataset(X, label=y) + params = {"objective": "binary", "verbosity": -1} + + # 1-round model + bst = lgb.train(params, dtrain, num_boost_round=1) + assert bst.predict(X).shape == (n_samples,) + assert bst.predict(X, pred_contrib=True).shape == (n_samples, n_features + 1) + assert bst.predict(X, pred_leaf=True).shape == (n_samples, 1) + + # 2-round model + bst = lgb.train(params, dtrain, num_boost_round=2) + assert bst.predict(X).shape == (n_samples,) + assert bst.predict(X, pred_contrib=True).shape == (n_samples, n_features + 1) + assert bst.predict(X, pred_leaf=True).shape == (n_samples, 2) + + +def test_predict_multiclass_classification_output_shape(): + n_samples = 1_000 + n_features = 10 + n_classes = 3 + X, y = make_classification(n_samples=n_samples, n_features=n_features, n_classes=n_classes, n_informative=6) + dtrain = lgb.Dataset(X, label=y) + params = {"objective": "multiclass", "verbosity": -1, "num_class": n_classes} + + # 1-round model + bst = lgb.train(params, dtrain, num_boost_round=1) + assert bst.predict(X).shape == (n_samples, n_classes) + assert bst.predict(X, pred_contrib=True).shape == (n_samples, n_classes * (n_features + 1)) + assert bst.predict(X, pred_leaf=True).shape == (n_samples, n_classes) + + # 2-round model + bst = lgb.train(params, dtrain, num_boost_round=2) + assert bst.predict(X).shape == (n_samples, n_classes) + assert bst.predict(X, pred_contrib=True).shape == (n_samples, n_classes * (n_features + 1)) + assert bst.predict(X, pred_leaf=True).shape == (n_samples, n_classes * 2) + + def test_average_precision_metric(): # test against sklearn 
average precision metric X, y = load_breast_cancer(return_X_y=True) From f3bd64a1ea815f761d0f845bc16b60f0a71fdd99 Mon Sep 17 00:00:00 2001 From: shiyu1994 Date: Thu, 2 Jan 2025 14:55:02 +0800 Subject: [PATCH 4/4] [CUDA] remove src/treelearner/kernels (#6766) * remove src/treelearner/kernels * Update CMakeLists.txt * clean up --- CMakeLists.txt | 56 -- .../kernels/histogram_16_64_256.cu | 949 ------------------ .../kernels/histogram_16_64_256.hu | 160 --- 3 files changed, 1165 deletions(-) delete mode 100644 src/treelearner/kernels/histogram_16_64_256.cu delete mode 100644 src/treelearner/kernels/histogram_16_64_256.hu diff --git a/CMakeLists.txt b/CMakeLists.txt index 4f57cf9622e6..36557304548f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -252,54 +252,6 @@ if(USE_CUDA) set(CMAKE_CUDA_STANDARD 11) set(CMAKE_CUDA_STANDARD_REQUIRED ON) endif() - - set( - BASE_DEFINES - -DPOWER_FEATURE_WORKGROUPS=12 - -DUSE_CONSTANT_BUF=0 - ) - set( - ALLFEATS_DEFINES - ${BASE_DEFINES} - -DENABLE_ALL_FEATURES - ) - set( - FULLDATA_DEFINES - ${ALLFEATS_DEFINES} - -DIGNORE_INDICES - ) - - message(STATUS "ALLFEATS_DEFINES: ${ALLFEATS_DEFINES}") - message(STATUS "FULLDATA_DEFINES: ${FULLDATA_DEFINES}") - - function(add_histogram hsize hname hadd hconst hdir) - add_library(histo${hsize}${hname} OBJECT src/treelearner/kernels/histogram${hsize}.cu) - set_target_properties( - histo${hsize}${hname} - PROPERTIES - CUDA_SEPARABLE_COMPILATION ON - CUDA_ARCHITECTURES ${CUDA_ARCHS} - ) - if(hadd) - list(APPEND histograms histo${hsize}${hname}) - set(histograms ${histograms} PARENT_SCOPE) - endif() - target_compile_definitions( - histo${hsize}${hname} - PRIVATE - -DCONST_HESSIAN=${hconst} - ${hdir} - ) - endfunction() - - foreach(hsize _16_64_256) - add_histogram("${hsize}" "_sp_const" "True" "1" "${BASE_DEFINES}") - add_histogram("${hsize}" "_sp" "True" "0" "${BASE_DEFINES}") - add_histogram("${hsize}" "-allfeats_sp_const" "False" "1" "${ALLFEATS_DEFINES}") - add_histogram("${hsize}" "-allfeats_sp" "False" "0" "${ALLFEATS_DEFINES}") - add_histogram("${hsize}" "-fulldata_sp_const" "True" "1" "${FULLDATA_DEFINES}") - add_histogram("${hsize}" "-fulldata_sp" "True" "0" "${FULLDATA_DEFINES}") - endforeach() endif() include(CheckCXXSourceCompiles) @@ -634,14 +586,6 @@ if(USE_CUDA) CUDA_RESOLVE_DEVICE_SYMBOLS ON ) endif() - - # histograms are list of object libraries. Linking object library to other - # object libraries only gets usage requirements, the linked objects won't be - # used. Thus we have to call target_link_libraries on final targets here. - if(BUILD_CLI) - target_link_libraries(lightgbm PRIVATE ${histograms}) - endif() - target_link_libraries(_lightgbm PRIVATE ${histograms}) endif() if(WIN32) diff --git a/src/treelearner/kernels/histogram_16_64_256.cu b/src/treelearner/kernels/histogram_16_64_256.cu deleted file mode 100644 index 9d8427a6f9a8..000000000000 --- a/src/treelearner/kernels/histogram_16_64_256.cu +++ /dev/null @@ -1,949 +0,0 @@ -/*! - * Copyright (c) 2020 IBM Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for license information. 
- */ - -#include - -#include -#include - -#include "histogram_16_64_256.hu" - -namespace LightGBM { - -// atomic add for float number in local memory -inline __device__ void atomic_local_add_f(acc_type *addr, const acc_type val) { - atomicAdd(addr, static_cast(val)); -} - -// histogram16 stuff -#ifdef ENABLE_ALL_FEATURES -#ifdef IGNORE_INDICES -#define KERNEL_NAME histogram16_fulldata -#else // IGNORE_INDICES -#define KERNEL_NAME histogram16 -#endif // IGNORE_INDICES -#else // ENABLE_ALL_FEATURES -#error "ENABLE_ALL_FEATURES should always be 1" -#define KERNEL_NAME histogram16 -#endif // ENABLE_ALL_FEATURES -#define NUM_BINS 16 -#define LOCAL_MEM_SIZE ((sizeof(unsigned int) + 2 * sizeof(acc_type)) * NUM_BINS) - -// this function will be called by histogram16 -// we have one sub-histogram of one feature in local memory, and need to read others -inline void __device__ within_kernel_reduction16x4(const acc_type* __restrict__ feature_sub_hist, - const unsigned int skip_id, - const unsigned int old_val_cont_bin0, - const uint16_t num_sub_hist, - acc_type* __restrict__ output_buf, - acc_type* __restrict__ local_hist, - const size_t power_feature_workgroups) { - const uint16_t ltid = threadIdx.x; - acc_type grad_bin = local_hist[ltid * 2]; - acc_type hess_bin = local_hist[ltid * 2 + 1]; - unsigned int* __restrict__ local_cnt = reinterpret_cast(local_hist + 2 * NUM_BINS); - - unsigned int cont_bin; - if (power_feature_workgroups != 0) { - cont_bin = ltid ? local_cnt[ltid] : old_val_cont_bin0; - } else { - cont_bin = local_cnt[ltid]; - } - uint16_t i; - - if (power_feature_workgroups != 0) { - // add all sub-histograms for feature - const acc_type* __restrict__ p = feature_sub_hist + ltid; - for (i = 0; i < skip_id; ++i) { - grad_bin += *p; p += NUM_BINS; - hess_bin += *p; p += NUM_BINS; - cont_bin += as_acc_int_type(*p); p += NUM_BINS; - } - - // skip the counters we already have - p += 3 * NUM_BINS; - - for (i = i + 1; i < num_sub_hist; ++i) { - grad_bin += *p; p += NUM_BINS; - hess_bin += *p; p += NUM_BINS; - cont_bin += as_acc_int_type(*p); p += NUM_BINS; - } - } - __syncthreads(); - - output_buf[ltid * 2 + 0] = grad_bin; - output_buf[ltid * 2 + 1] = hess_bin; -} - -#if USE_CONSTANT_BUF == 1 -__kernel void KERNEL_NAME(__global const uchar* restrict feature_data_base, - __constant const uchar* restrict feature_masks __attribute__((max_constant_size(65536))), - const data_size_t feature_size, - __constant const data_size_t* restrict data_indices __attribute__((max_constant_size(65536))), - const data_size_t num_data, - __constant const score_t* restrict ordered_gradients __attribute__((max_constant_size(65536))), -#if CONST_HESSIAN == 0 - __constant const score_t* restrict ordered_hessians __attribute__((max_constant_size(65536))), -#else - const score_t const_hessian, -#endif - char* __restrict__ output_buf, - volatile int * sync_counters, - acc_type* __restrict__ hist_buf_base, - const size_t power_feature_workgroups) { -#else -__global__ void KERNEL_NAME(const uchar* feature_data_base, - const uchar* __restrict__ feature_masks, - const data_size_t feature_size, - const data_size_t* data_indices, - const data_size_t num_data, - const score_t* ordered_gradients, -#if CONST_HESSIAN == 0 - const score_t* ordered_hessians, -#else - const score_t const_hessian, -#endif - char* __restrict__ output_buf, - volatile int * sync_counters, - acc_type* __restrict__ hist_buf_base, - const size_t power_feature_workgroups) { -#endif - // allocate the local memory array aligned with float2, to guarantee correct 
alignment on NVIDIA platforms - // otherwise a "Misaligned Address" exception may occur - __shared__ float2 shared_array[LOCAL_MEM_SIZE/sizeof(float2)]; - const unsigned int gtid = blockIdx.x * blockDim.x + threadIdx.x; - const uint16_t ltid = threadIdx.x; - const uint16_t lsize = NUM_BINS; // get_local_size(0); - const uint16_t group_id = blockIdx.x; - - // local memory per workgroup is 3 KB - // clear local memory - unsigned int *ptr = reinterpret_cast(shared_array); - for (int i = ltid; i < LOCAL_MEM_SIZE/sizeof(unsigned int); i += lsize) { - ptr[i] = 0; - } - __syncthreads(); - // gradient/hessian histograms - // assume this starts at 32 * 4 = 128-byte boundary // What does it mean? boundary?? - // total size: 2 * 256 * size_of(float) = 2 KB - // organization: each feature/grad/hessian is at a different bank, - // as independent of the feature value as possible - acc_type *gh_hist = reinterpret_cast(shared_array); - - // counter histogram - // total size: 256 * size_of(unsigned int) = 1 KB - unsigned int *cnt_hist = reinterpret_cast(gh_hist + 2 * NUM_BINS); - - // odd threads (1, 3, ...) compute histograms for hessians first - // even thread (0, 2, ...) compute histograms for gradients first - // etc. - uchar is_hessian_first = ltid & 1; - - uint16_t feature_id = group_id >> power_feature_workgroups; - - // each 2^POWER_FEATURE_WORKGROUPS workgroups process on one feature (compile-time constant) - // feature_size is the number of examples per feature - const uchar *feature_data = feature_data_base + feature_id * feature_size; - - // size of threads that process this feature4 - const unsigned int subglobal_size = lsize * (1 << power_feature_workgroups); - - // equivalent thread ID in this subgroup for this feature4 - const unsigned int subglobal_tid = gtid - feature_id * subglobal_size; - - - data_size_t ind; - data_size_t ind_next; - #ifdef IGNORE_INDICES - ind = subglobal_tid; - #else - ind = data_indices[subglobal_tid]; - #endif - - // extract feature mask, when a byte is set to 0, that feature is disabled - uchar feature_mask = feature_masks[feature_id]; - // exit if the feature is masked - if (!feature_mask) { - return; - } else { - feature_mask = feature_mask - 1; // feature_mask is used for get feature (1: 4bit feature, 0: 8bit feature) - } - - // STAGE 1: read feature data, and gradient and hessian - // first half of the threads read feature data from global memory - // We will prefetch data into the "next" variable at the beginning of each iteration - uchar feature; - uchar feature_next; - uint16_t bin; - - feature = feature_data[ind >> feature_mask]; - if (feature_mask) { - feature = (feature >> ((ind & 1) << 2)) & 0xf; - } - bin = feature; - acc_type grad_bin = 0.0f, hess_bin = 0.0f; - acc_type *addr_bin; - - // store gradient and hessian - score_t grad, hess; - score_t grad_next, hess_next; - grad = ordered_gradients[ind]; - #if CONST_HESSIAN == 0 - hess = ordered_hessians[ind]; - #endif - - // there are 2^POWER_FEATURE_WORKGROUPS workgroups processing each feature4 - for (unsigned int i = subglobal_tid; i < num_data; i += subglobal_size) { - // prefetch the next iteration variables - // we don't need boundary check because we have made the buffer large - int i_next = i + subglobal_size; - #ifdef IGNORE_INDICES - // we need to check to bounds here - ind_next = i_next < num_data ? 
i_next : i; - #else - ind_next = data_indices[i_next]; - #endif - - grad_next = ordered_gradients[ind_next]; - #if CONST_HESSIAN == 0 - hess_next = ordered_hessians[ind_next]; - #endif - - // STAGE 2: accumulate gradient and hessian - if (bin != feature) { - addr_bin = gh_hist + bin * 2 + is_hessian_first; - #if CONST_HESSIAN == 0 - acc_type acc_bin = is_hessian_first ? hess_bin : grad_bin; - atomic_local_add_f(addr_bin, acc_bin); - - addr_bin = addr_bin + 1 - 2 * is_hessian_first; - acc_bin = is_hessian_first ? grad_bin : hess_bin; - atomic_local_add_f(addr_bin, acc_bin); - - #elif CONST_HESSIAN == 1 - atomic_local_add_f(addr_bin, grad_bin); - #endif - - bin = feature; - grad_bin = grad; - hess_bin = hess; - } else { - grad_bin += grad; - hess_bin += hess; - } - - // prefetch the next iteration variables - feature_next = feature_data[ind_next >> feature_mask]; - - // STAGE 3: accumulate counter - atomicAdd(cnt_hist + feature, 1); - - // STAGE 4: update next stat - grad = grad_next; - hess = hess_next; - if (!feature_mask) { - feature = feature_next; - } else { - feature = (feature_next >> ((ind_next & 1) << 2)) & 0xf; - } - } - - - addr_bin = gh_hist + bin * 2 + is_hessian_first; - #if CONST_HESSIAN == 0 - acc_type acc_bin = is_hessian_first ? hess_bin : grad_bin; - atomic_local_add_f(addr_bin, acc_bin); - - addr_bin = addr_bin + 1 - 2 * is_hessian_first; - acc_bin = is_hessian_first ? grad_bin : hess_bin; - atomic_local_add_f(addr_bin, acc_bin); - - #elif CONST_HESSIAN == 1 - atomic_local_add_f(addr_bin, grad_bin); - #endif - __syncthreads(); - - #if CONST_HESSIAN == 1 - // make a final reduction - gh_hist[ltid * 2] += gh_hist[ltid * 2 + 1]; - gh_hist[ltid * 2 + 1] = const_hessian * cnt_hist[ltid]; // counter move to this position - __syncthreads(); - #endif - -#if POWER_FEATURE_WORKGROUPS != 0 - acc_type *__restrict__ output = reinterpret_cast(output_buf) + group_id * 3 * NUM_BINS; - // write gradients and Hessians - acc_type *__restrict__ ptr_f = output; - for (uint16_t i = ltid; i < 2 * NUM_BINS; i += lsize) { - // even threads read gradients, odd threads read Hessians - acc_type value = gh_hist[i]; - ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value; - } - // write counts - acc_int_type *__restrict__ ptr_i = reinterpret_cast(output + 2 * NUM_BINS); - for (uint16_t i = ltid; i < NUM_BINS; i += lsize) { - unsigned int value = cnt_hist[i]; - ptr_i[i] = value; - } - __syncthreads(); - __threadfence(); - unsigned int * counter_val = cnt_hist; - // backup the old value - unsigned int old_val = *counter_val; - if (ltid == 0) { - // all workgroups processing the same feature add this counter - *counter_val = atomicAdd(const_cast(sync_counters + feature_id), 1); - } - // make sure everyone in this workgroup is here - __syncthreads(); - // everyone in this workgroup: if we are the last workgroup, then do reduction! 
- if (*counter_val == (1 << power_feature_workgroups) - 1) { - if (ltid == 0) { - sync_counters[feature_id] = 0; - } -#else - } - // only 1 work group, no need to increase counter - // the reduction will become a simple copy - { - unsigned int old_val; // dummy -#endif - // locate our feature's block in output memory - unsigned int output_offset = (feature_id << power_feature_workgroups); - acc_type const * __restrict__ feature_subhists = - reinterpret_cast(output_buf) + output_offset * 3 * NUM_BINS; - // skip reading the data already in local memory - unsigned int skip_id = group_id - output_offset; - // locate output histogram location for this feature4 - acc_type *__restrict__ hist_buf = hist_buf_base + feature_id * 2 * NUM_BINS; - - within_kernel_reduction16x4(feature_subhists, skip_id, old_val, 1 << power_feature_workgroups, hist_buf, reinterpret_cast(shared_array), power_feature_workgroups); - } -} - -// end of histogram16 stuff - -// histogram64 stuff -#undef KERNEL_NAME -#undef NUM_BINS -#undef LOCAL_MEM_SIZE -#ifdef ENABLE_ALL_FEATURES -#ifdef IGNORE_INDICES -#define KERNEL_NAME histogram64_fulldata -#else // IGNORE_INDICES -#define KERNEL_NAME histogram64 // seems like ENABLE_ALL_FEATURES is set to 1 in the header if its disabled -// #define KERNEL_NAME histogram64_allfeats -#endif // IGNORE_INDICES -#else // ENABLE_ALL_FEATURES -#error "ENABLE_ALL_FEATURES should always be 1" -#define KERNEL_NAME histogram64 -#endif // ENABLE_ALL_FEATURES -#define NUM_BINS 64 -#define LOCAL_MEM_SIZE ((sizeof(unsigned int) + 2 * sizeof(acc_type)) * NUM_BINS) - -// this function will be called by histogram64 -// we have one sub-histogram of one feature in local memory, and need to read others -inline void __device__ within_kernel_reduction64x4(const acc_type* __restrict__ feature_sub_hist, - const unsigned int skip_id, - const unsigned int old_val_cont_bin0, - const uint16_t num_sub_hist, - acc_type* __restrict__ output_buf, - acc_type* __restrict__ local_hist, - const size_t power_feature_workgroups) { - const uint16_t ltid = threadIdx.x; - acc_type grad_bin = local_hist[ltid * 2]; - acc_type hess_bin = local_hist[ltid * 2 + 1]; - unsigned int* __restrict__ local_cnt = reinterpret_cast(local_hist + 2 * NUM_BINS); - - unsigned int cont_bin; - if (power_feature_workgroups != 0) { - cont_bin = ltid ? 
local_cnt[ltid] : old_val_cont_bin0; - } else { - cont_bin = local_cnt[ltid]; - } - uint16_t i; - - if (power_feature_workgroups != 0) { - // add all sub-histograms for feature - const acc_type* __restrict__ p = feature_sub_hist + ltid; - for (i = 0; i < skip_id; ++i) { - grad_bin += *p; p += NUM_BINS; - hess_bin += *p; p += NUM_BINS; - cont_bin += as_acc_int_type(*p); p += NUM_BINS; - } - - // skip the counters we already have - p += 3 * NUM_BINS; - - for (i = i + 1; i < num_sub_hist; ++i) { - grad_bin += *p; p += NUM_BINS; - hess_bin += *p; p += NUM_BINS; - cont_bin += as_acc_int_type(*p); p += NUM_BINS; - } - } - __syncthreads(); - - output_buf[ltid * 2 + 0] = grad_bin; - output_buf[ltid * 2 + 1] = hess_bin; -} - -#if USE_CONSTANT_BUF == 1 -__kernel void KERNEL_NAME(__global const uchar* restrict feature_data_base, - __constant const uchar* restrict feature_masks __attribute__((max_constant_size(65536))), - const data_size_t feature_size, - __constant const data_size_t* restrict data_indices __attribute__((max_constant_size(65536))), - const data_size_t num_data, - __constant const score_t* restrict ordered_gradients __attribute__((max_constant_size(65536))), -#if CONST_HESSIAN == 0 - __constant const score_t* restrict ordered_hessians __attribute__((max_constant_size(65536))), -#else - const score_t const_hessian, -#endif - char* __restrict__ output_buf, - volatile int * sync_counters, - acc_type* __restrict__ hist_buf_base, - const size_t power_feature_workgroups) { -#else -__global__ void KERNEL_NAME(const uchar* feature_data_base, - const uchar* __restrict__ feature_masks, - const data_size_t feature_size, - const data_size_t* data_indices, - const data_size_t num_data, - const score_t* ordered_gradients, -#if CONST_HESSIAN == 0 - const score_t* ordered_hessians, -#else - const score_t const_hessian, -#endif - char* __restrict__ output_buf, - volatile int * sync_counters, - acc_type* __restrict__ hist_buf_base, - const size_t power_feature_workgroups) { -#endif - // allocate the local memory array aligned with float2, to guarantee correct alignment on NVIDIA platforms - // otherwise a "Misaligned Address" exception may occur - __shared__ float2 shared_array[LOCAL_MEM_SIZE/sizeof(float2)]; - const unsigned int gtid = blockIdx.x * blockDim.x + threadIdx.x; - const uint16_t ltid = threadIdx.x; - const uint16_t lsize = NUM_BINS; // get_local_size(0); - const uint16_t group_id = blockIdx.x; - - // local memory per workgroup is 3 KB - // clear local memory - unsigned int *ptr = reinterpret_cast(shared_array); - for (int i = ltid; i < LOCAL_MEM_SIZE/sizeof(unsigned int); i += lsize) { - ptr[i] = 0; - } - __syncthreads(); - // gradient/hessian histograms - // assume this starts at 32 * 4 = 128-byte boundary // What does it mean? boundary?? - // total size: 2 * 256 * size_of(float) = 2 KB - // organization: each feature/grad/hessian is at a different bank, - // as independent of the feature value as possible - acc_type *gh_hist = reinterpret_cast(shared_array); - - // counter histogram - // total size: 256 * size_of(unsigned int) = 1 KB - unsigned int *cnt_hist = reinterpret_cast(gh_hist + 2 * NUM_BINS); - - // odd threads (1, 3, ...) compute histograms for Hessians first - // even thread (0, 2, ...) compute histograms for gradients first - // etc. 
- uchar is_hessian_first = ltid & 1; - - uint16_t feature_id = group_id >> power_feature_workgroups; - - // each 2^POWER_FEATURE_WORKGROUPS workgroups process on one feature (compile-time constant) - // feature_size is the number of examples per feature - const uchar *feature_data = feature_data_base + feature_id * feature_size; - - // size of threads that process this feature4 - const unsigned int subglobal_size = lsize * (1 << power_feature_workgroups); - - // equivalent thread ID in this subgroup for this feature4 - const unsigned int subglobal_tid = gtid - feature_id * subglobal_size; - - data_size_t ind; - data_size_t ind_next; - #ifdef IGNORE_INDICES - ind = subglobal_tid; - #else - ind = data_indices[subglobal_tid]; - #endif - - // extract feature mask, when a byte is set to 0, that feature is disabled - uchar feature_mask = feature_masks[feature_id]; - // exit if the feature is masked - if (!feature_mask) { - return; - } else { - feature_mask = feature_mask - 1; // feature_mask is used for get feature (1: 4bit feature, 0: 8bit feature) - } - - // STAGE 1: read feature data, and gradient and hessian - // first half of the threads read feature data from global memory - // We will prefetch data into the "next" variable at the beginning of each iteration - uchar feature; - uchar feature_next; - uint16_t bin; - - feature = feature_data[ind >> feature_mask]; - if (feature_mask) { - feature = (feature >> ((ind & 1) << 2)) & 0xf; - } - bin = feature; - acc_type grad_bin = 0.0f, hess_bin = 0.0f; - acc_type *addr_bin; - - // store gradient and hessian - score_t grad, hess; - score_t grad_next, hess_next; - grad = ordered_gradients[ind]; - #if CONST_HESSIAN == 0 - hess = ordered_hessians[ind]; - #endif - - // there are 2^POWER_FEATURE_WORKGROUPS workgroups processing each feature4 - for (unsigned int i = subglobal_tid; i < num_data; i += subglobal_size) { - // prefetch the next iteration variables - // we don't need boundary check because we have made the buffer large - int i_next = i + subglobal_size; - #ifdef IGNORE_INDICES - // we need to check to bounds here - ind_next = i_next < num_data ? i_next : i; - #else - ind_next = data_indices[i_next]; - #endif - - grad_next = ordered_gradients[ind_next]; - #if CONST_HESSIAN == 0 - hess_next = ordered_hessians[ind_next]; - #endif - - // STAGE 2: accumulate gradient and hessian - if (bin != feature) { - addr_bin = gh_hist + bin * 2 + is_hessian_first; - #if CONST_HESSIAN == 0 - acc_type acc_bin = is_hessian_first ? hess_bin : grad_bin; - atomic_local_add_f(addr_bin, acc_bin); - - addr_bin = addr_bin + 1 - 2 * is_hessian_first; - acc_bin = is_hessian_first ? grad_bin : hess_bin; - atomic_local_add_f(addr_bin, acc_bin); - - #elif CONST_HESSIAN == 1 - atomic_local_add_f(addr_bin, grad_bin); - #endif - - bin = feature; - grad_bin = grad; - hess_bin = hess; - } else { - grad_bin += grad; - hess_bin += hess; - } - - // prefetch the next iteration variables - feature_next = feature_data[ind_next >> feature_mask]; - - // STAGE 3: accumulate counter - atomicAdd(cnt_hist + feature, 1); - - // STAGE 4: update next stat - grad = grad_next; - hess = hess_next; - if (!feature_mask) { - feature = feature_next; - } else { - feature = (feature_next >> ((ind_next & 1) << 2)) & 0xf; - } - } - - addr_bin = gh_hist + bin * 2 + is_hessian_first; - #if CONST_HESSIAN == 0 - acc_type acc_bin = is_hessian_first ? hess_bin : grad_bin; - atomic_local_add_f(addr_bin, acc_bin); - - addr_bin = addr_bin + 1 - 2 * is_hessian_first; - acc_bin = is_hessian_first ? 
grad_bin : hess_bin; - atomic_local_add_f(addr_bin, acc_bin); - - #elif CONST_HESSIAN == 1 - atomic_local_add_f(addr_bin, grad_bin); - #endif - __syncthreads(); - - #if CONST_HESSIAN == 1 - // make a final reduction - gh_hist[ltid * 2] += gh_hist[ltid * 2 + 1]; - gh_hist[ltid * 2 + 1] = const_hessian * cnt_hist[ltid]; // counter move to this position - __syncthreads(); - #endif - -#if POWER_FEATURE_WORKGROUPS != 0 - acc_type *__restrict__ output = reinterpret_cast(output_buf) + group_id * 3 * NUM_BINS; - // write gradients and Hessians - acc_type *__restrict__ ptr_f = output; - for (uint16_t i = ltid; i < 2 * NUM_BINS; i += lsize) { - // even threads read gradients, odd threads read Hessians - acc_type value = gh_hist[i]; - ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value; - } - // write counts - acc_int_type *__restrict__ ptr_i = reinterpret_cast(output + 2 * NUM_BINS); - for (uint16_t i = ltid; i < NUM_BINS; i += lsize) { - unsigned int value = cnt_hist[i]; - ptr_i[i] = value; - } - __syncthreads(); - __threadfence(); - unsigned int * counter_val = cnt_hist; - // backup the old value - unsigned int old_val = *counter_val; - if (ltid == 0) { - // all workgroups processing the same feature add this counter - *counter_val = atomicAdd(const_cast(sync_counters + feature_id), 1); - } - // make sure everyone in this workgroup is here - __syncthreads(); - // everyone in this workgroup: if we are the last workgroup, then do reduction! - if (*counter_val == (1 << power_feature_workgroups) - 1) { - if (ltid == 0) { - sync_counters[feature_id] = 0; - } -#else - } - // only 1 work group, no need to increase counter - // the reduction will become a simple copy - { - unsigned int old_val; // dummy -#endif - // locate our feature's block in output memory - unsigned int output_offset = (feature_id << power_feature_workgroups); - acc_type const * __restrict__ feature_subhists = - reinterpret_cast(output_buf) + output_offset * 3 * NUM_BINS; - // skip reading the data already in local memory - unsigned int skip_id = group_id - output_offset; - // locate output histogram location for this feature4 - acc_type *__restrict__ hist_buf = hist_buf_base + feature_id * 2 * NUM_BINS; - - within_kernel_reduction64x4(feature_subhists, skip_id, old_val, 1 << power_feature_workgroups, hist_buf, reinterpret_cast(shared_array), power_feature_workgroups); - } -} - -// end of histogram64 stuff - -// histogram256 stuff -#undef KERNEL_NAME -#undef NUM_BINS -#undef LOCAL_MEM_SIZE -#ifdef ENABLE_ALL_FEATURES -#ifdef IGNORE_INDICES -#define KERNEL_NAME histogram256_fulldata -#else // IGNORE_INDICES -#define KERNEL_NAME histogram256 // seems like ENABLE_ALL_FEATURES is set to 1 in the header if its disabled -// #define KERNEL_NAME histogram256_allfeats -#endif // IGNORE_INDICES -#else // ENABLE_ALL_FEATURES -#error "ENABLE_ALL_FEATURES should always be 1" -#define KERNEL_NAME histogram256 -#endif // ENABLE_ALL_FEATURES -#define NUM_BINS 256 -#define LOCAL_MEM_SIZE ((sizeof(unsigned int) + 2 * sizeof(acc_type)) * NUM_BINS) - -// this function will be called by histogram256 -// we have one sub-histogram of one feature in local memory, and need to read others -inline void __device__ within_kernel_reduction256x4(const acc_type* __restrict__ feature_sub_hist, - const unsigned int skip_id, - const unsigned int old_val_cont_bin0, - const uint16_t num_sub_hist, - acc_type* __restrict__ output_buf, - acc_type* __restrict__ local_hist, - const size_t power_feature_workgroups) { - const uint16_t ltid = threadIdx.x; - acc_type grad_bin = 
local_hist[ltid * 2]; - acc_type hess_bin = local_hist[ltid * 2 + 1]; - unsigned int* __restrict__ local_cnt = reinterpret_cast(local_hist + 2 * NUM_BINS); - - unsigned int cont_bin; - if (power_feature_workgroups != 0) { - cont_bin = ltid ? local_cnt[ltid] : old_val_cont_bin0; - } else { - cont_bin = local_cnt[ltid]; - } - uint16_t i; - - if (power_feature_workgroups != 0) { - // add all sub-histograms for feature - const acc_type* __restrict__ p = feature_sub_hist + ltid; - for (i = 0; i < skip_id; ++i) { - grad_bin += *p; p += NUM_BINS; - hess_bin += *p; p += NUM_BINS; - cont_bin += as_acc_int_type(*p); p += NUM_BINS; - } - - // skip the counters we already have - p += 3 * NUM_BINS; - - for (i = i + 1; i < num_sub_hist; ++i) { - grad_bin += *p; p += NUM_BINS; - hess_bin += *p; p += NUM_BINS; - cont_bin += as_acc_int_type(*p); p += NUM_BINS; - } - } - - __syncthreads(); - - output_buf[ltid * 2 + 0] = grad_bin; - output_buf[ltid * 2 + 1] = hess_bin; -} - -#if USE_CONSTANT_BUF == 1 -__kernel void KERNEL_NAME(__global const uchar* restrict feature_data_base, - __constant const uchar* restrict feature_masks __attribute__((max_constant_size(65536))), - const data_size_t feature_size, - __constant const data_size_t* restrict data_indices __attribute__((max_constant_size(65536))), - const data_size_t num_data, - __constant const score_t* restrict ordered_gradients __attribute__((max_constant_size(65536))), -#if CONST_HESSIAN == 0 - __constant const score_t* restrict ordered_hessians __attribute__((max_constant_size(65536))), -#else - const score_t const_hessian, -#endif - char* __restrict__ output_buf, - volatile int * sync_counters, - acc_type* __restrict__ hist_buf_base, - const size_t power_feature_workgroups) { -#else -__global__ void KERNEL_NAME(const uchar* feature_data_base, - const uchar* __restrict__ feature_masks, - const data_size_t feature_size, - const data_size_t* data_indices, - const data_size_t num_data, - const score_t* ordered_gradients, -#if CONST_HESSIAN == 0 - const score_t* ordered_hessians, -#else - const score_t const_hessian, -#endif - char* __restrict__ output_buf, - volatile int * sync_counters, - acc_type* __restrict__ hist_buf_base, - const size_t power_feature_workgroups) { -#endif - // allocate the local memory array aligned with float2, to guarantee correct alignment on NVIDIA platforms - // otherwise a "Misaligned Address" exception may occur - __shared__ float2 shared_array[LOCAL_MEM_SIZE/sizeof(float2)]; - const unsigned int gtid = blockIdx.x * blockDim.x + threadIdx.x; - const uint16_t ltid = threadIdx.x; - const uint16_t lsize = NUM_BINS; // get_local_size(0); - const uint16_t group_id = blockIdx.x; - - // local memory per workgroup is 3 KB - // clear local memory - unsigned int *ptr = reinterpret_cast(shared_array); - for (int i = ltid; i < LOCAL_MEM_SIZE/sizeof(unsigned int); i += lsize) { - ptr[i] = 0; - } - __syncthreads(); - // gradient/hessian histograms - // assume this starts at 32 * 4 = 128-byte boundary // What does it mean? boundary?? - // total size: 2 * 256 * size_of(float) = 2 KB - // organization: each feature/grad/hessian is at a different bank, - // as independent of the feature value as possible - acc_type *gh_hist = reinterpret_cast(shared_array); - - // counter histogram - // total size: 256 * size_of(unsigned int) = 1 KB - unsigned int *cnt_hist = reinterpret_cast(gh_hist + 2 * NUM_BINS); - - // odd threads (1, 3, ...) compute histograms for hessians first - // even thread (0, 2, ...) compute histograms for gradients first - // etc. 
- uchar is_hessian_first = ltid & 1; - - uint16_t feature_id = group_id >> power_feature_workgroups; - - // each 2^POWER_FEATURE_WORKGROUPS workgroups process on one feature (compile-time constant) - // feature_size is the number of examples per feature - const uchar *feature_data = feature_data_base + feature_id * feature_size; - - // size of threads that process this feature4 - const unsigned int subglobal_size = lsize * (1 << power_feature_workgroups); - - // equivalent thread ID in this subgroup for this feature4 - const unsigned int subglobal_tid = gtid - feature_id * subglobal_size; - - data_size_t ind; - data_size_t ind_next; - #ifdef IGNORE_INDICES - ind = subglobal_tid; - #else - ind = data_indices[subglobal_tid]; - #endif - - // extract feature mask, when a byte is set to 0, that feature is disabled - uchar feature_mask = feature_masks[feature_id]; - // exit if the feature is masked - if (!feature_mask) { - return; - } else { - feature_mask = feature_mask - 1; // feature_mask is used for get feature (1: 4bit feature, 0: 8bit feature) - } - - // STAGE 1: read feature data, and gradient and hessian - // first half of the threads read feature data from global memory - // We will prefetch data into the "next" variable at the beginning of each iteration - uchar feature; - uchar feature_next; - uint16_t bin; - - feature = feature_data[ind >> feature_mask]; - if (feature_mask) { - feature = (feature >> ((ind & 1) << 2)) & 0xf; - } - bin = feature; - acc_type grad_bin = 0.0f, hess_bin = 0.0f; - acc_type *addr_bin; - - // store gradient and hessian - score_t grad, hess; - score_t grad_next, hess_next; - grad = ordered_gradients[ind]; - #if CONST_HESSIAN == 0 - hess = ordered_hessians[ind]; - #endif - - // there are 2^POWER_FEATURE_WORKGROUPS workgroups processing each feature4 - for (unsigned int i = subglobal_tid; i < num_data; i += subglobal_size) { - // prefetch the next iteration variables - // we don't need boundary check because we have made the buffer large - int i_next = i + subglobal_size; - #ifdef IGNORE_INDICES - // we need to check to bounds here - ind_next = i_next < num_data ? i_next : i; - #else - ind_next = data_indices[i_next]; - #endif - - grad_next = ordered_gradients[ind_next]; - #if CONST_HESSIAN == 0 - hess_next = ordered_hessians[ind_next]; - #endif - // STAGE 2: accumulate gradient and hessian - if (bin != feature) { - addr_bin = gh_hist + bin * 2 + is_hessian_first; - #if CONST_HESSIAN == 0 - acc_type acc_bin = is_hessian_first ? hess_bin : grad_bin; - atomic_local_add_f(addr_bin, acc_bin); - - addr_bin = addr_bin + 1 - 2 * is_hessian_first; - acc_bin = is_hessian_first ? grad_bin : hess_bin; - atomic_local_add_f(addr_bin, acc_bin); - - #elif CONST_HESSIAN == 1 - atomic_local_add_f(addr_bin, grad_bin); - #endif - - bin = feature; - grad_bin = grad; - hess_bin = hess; - } else { - grad_bin += grad; - hess_bin += hess; - } - - // prefetch the next iteration variables - feature_next = feature_data[ind_next >> feature_mask]; - - // STAGE 3: accumulate counter - atomicAdd(cnt_hist + feature, 1); - - // STAGE 4: update next stat - grad = grad_next; - hess = hess_next; - if (!feature_mask) { - feature = feature_next; - } else { - feature = (feature_next >> ((ind_next & 1) << 2)) & 0xf; - } - } - - addr_bin = gh_hist + bin * 2 + is_hessian_first; - #if CONST_HESSIAN == 0 - acc_type acc_bin = is_hessian_first ? hess_bin : grad_bin; - atomic_local_add_f(addr_bin, acc_bin); - - addr_bin = addr_bin + 1 - 2 * is_hessian_first; - acc_bin = is_hessian_first ? 
grad_bin : hess_bin; - - atomic_local_add_f(addr_bin, acc_bin); - - #elif CONST_HESSIAN == 1 - atomic_local_add_f(addr_bin, grad_bin); - #endif - __syncthreads(); - - #if CONST_HESSIAN == 1 - // make a final reduction - gh_hist[ltid * 2] += gh_hist[ltid * 2 + 1]; - gh_hist[ltid * 2 + 1] = const_hessian * cnt_hist[ltid]; // counter move to this position - __syncthreads(); - #endif - -#if POWER_FEATURE_WORKGROUPS != 0 - acc_type *__restrict__ output = reinterpret_cast(output_buf) + group_id * 3 * NUM_BINS; - // write gradients and Hessians - acc_type *__restrict__ ptr_f = output; - for (uint16_t i = ltid; i < 2 * NUM_BINS; i += lsize) { - // even threads read gradients, odd threads read Hessians - acc_type value = gh_hist[i]; - ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value; - } - // write counts - acc_int_type *__restrict__ ptr_i = reinterpret_cast(output + 2 * NUM_BINS); - for (uint16_t i = ltid; i < NUM_BINS; i += lsize) { - unsigned int value = cnt_hist[i]; - ptr_i[i] = value; - } - __syncthreads(); - __threadfence(); - unsigned int * counter_val = cnt_hist; - // backup the old value - unsigned int old_val = *counter_val; - if (ltid == 0) { - // all workgroups processing the same feature add this counter - *counter_val = atomicAdd(const_cast(sync_counters + feature_id), 1); - } - // make sure everyone in this workgroup is here - __syncthreads(); - // everyone in this workgroup: if we are the last workgroup, then do reduction! - if (*counter_val == (1 << power_feature_workgroups) - 1) { - if (ltid == 0) { - sync_counters[feature_id] = 0; - } -#else - } - // only 1 work group, no need to increase counter - // the reduction will become a simple copy - { - unsigned int old_val; // dummy -#endif - // locate our feature's block in output memory - unsigned int output_offset = (feature_id << power_feature_workgroups); - acc_type const * __restrict__ feature_subhists = - reinterpret_cast(output_buf) + output_offset * 3 * NUM_BINS; - // skip reading the data already in local memory - unsigned int skip_id = group_id - output_offset; - // locate output histogram location for this feature4 - acc_type *__restrict__ hist_buf = hist_buf_base + feature_id * 2 * NUM_BINS; - - within_kernel_reduction256x4(feature_subhists, skip_id, old_val, 1 << power_feature_workgroups, hist_buf, reinterpret_cast(shared_array), power_feature_workgroups); - } -} - -// end of histogram256 stuff - -} // namespace LightGBM diff --git a/src/treelearner/kernels/histogram_16_64_256.hu b/src/treelearner/kernels/histogram_16_64_256.hu deleted file mode 100644 index f72b7465783d..000000000000 --- a/src/treelearner/kernels/histogram_16_64_256.hu +++ /dev/null @@ -1,160 +0,0 @@ -/*! - * Copyright (c) 2020 IBM Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for license information. 
- */ - -#ifndef LIGHTGBM_TREELEARNER_KERNELS_HISTOGRAM_16_64_256_HU_ -#define LIGHTGBM_TREELEARNER_KERNELS_HISTOGRAM_16_64_256_HU_ - -#include "LightGBM/meta.h" - -namespace LightGBM { - -// use double precision or not -#ifndef USE_DP_FLOAT -#define USE_DP_FLOAT 1 -#endif - -// ignore hessian, and use the local memory for hessian as an additional bank for gradient -#ifndef CONST_HESSIAN -#define CONST_HESSIAN 0 -#endif - -typedef unsigned char uchar; - -template -__device__ double as_double(const T t) { - static_assert(sizeof(T) == sizeof(double), "size mismatch"); - double d; - memcpy(&d, &t, sizeof(T)); - return d; -} -template -__device__ unsigned long long as_ulong_ulong(const T t) { - static_assert(sizeof(T) == sizeof(unsigned long long), "size mismatch"); - unsigned long long u; - memcpy(&u, &t, sizeof(T)); - return u; -} -template -__device__ float as_float(const T t) { - static_assert(sizeof(T) == sizeof(float), "size mismatch"); - float f; - memcpy(&f, &t, sizeof(T)); - return f; -} -template -__device__ unsigned int as_uint(const T t) { - static_assert(sizeof(T) == sizeof(unsigned int), "size_mismatch"); - unsigned int u; - memcpy(&u, &t, sizeof(T)); - return u; -} -template -__device__ uchar4 as_uchar4(const T t) { - static_assert(sizeof(T) == sizeof(uchar4), "size mismatch"); - uchar4 u; - memcpy(&u, &t, sizeof(T)); - return u; -} - -#if USE_DP_FLOAT == 1 -typedef double acc_type; -typedef unsigned long long acc_int_type; -#define as_acc_type as_double -#define as_acc_int_type as_ulong_ulong -#else -typedef float acc_type; -typedef unsigned int acc_int_type; -#define as_acc_type as_float -#define as_acc_int_type as_uint -#endif - -// use all features and do not use feature mask -#ifndef ENABLE_ALL_FEATURES -#define ENABLE_ALL_FEATURES 1 -#endif - -// define all of the different kernels - -#define DECLARE_CONST_BUF(name) \ -__global__ void name(__global const uchar* restrict feature_data_base, \ - const uchar* restrict feature_masks,\ - const data_size_t feature_size,\ - const data_size_t* restrict data_indices, \ - const data_size_t num_data, \ - const score_t* restrict ordered_gradients, \ - const score_t* restrict ordered_hessians,\ - char* __restrict__ output_buf,\ - volatile int * sync_counters,\ - acc_type* __restrict__ hist_buf_base, \ - const size_t power_feature_workgroups); - - -#define DECLARE_CONST_HES_CONST_BUF(name) \ -__global__ void name(const uchar* __restrict__ feature_data_base, \ - const uchar* __restrict__ feature_masks,\ - const data_size_t feature_size,\ - const data_size_t* __restrict__ data_indices, \ - const data_size_t num_data, \ - const score_t* __restrict__ ordered_gradients, \ - const score_t const_hessian,\ - char* __restrict__ output_buf,\ - volatile int * sync_counters,\ - acc_type* __restrict__ hist_buf_base, \ - const size_t power_feature_workgroups); - - - -#define DECLARE_CONST_HES(name) \ -__global__ void name(const uchar* feature_data_base, \ - const uchar* __restrict__ feature_masks,\ - const data_size_t feature_size,\ - const data_size_t* data_indices, \ - const data_size_t num_data, \ - const score_t* ordered_gradients, \ - const score_t const_hessian,\ - char* __restrict__ output_buf, \ - volatile int * sync_counters,\ - acc_type* __restrict__ hist_buf_base, \ - const size_t power_feature_workgroups); - - -#define DECLARE(name) \ -__global__ void name(const uchar* feature_data_base, \ - const uchar* __restrict__ feature_masks,\ - const data_size_t feature_size,\ - const data_size_t* data_indices, \ - const data_size_t num_data, \ - 
const score_t* ordered_gradients, \ - const score_t* ordered_hessians,\ - char* __restrict__ output_buf, \ - volatile int * sync_counters,\ - acc_type* __restrict__ hist_buf_base, \ - const size_t power_feature_workgroups); - - -DECLARE_CONST_HES(histogram16_allfeats); -DECLARE_CONST_HES(histogram16_fulldata); -DECLARE_CONST_HES(histogram16); -DECLARE(histogram16_allfeats); -DECLARE(histogram16_fulldata); -DECLARE(histogram16); - -DECLARE_CONST_HES(histogram64_allfeats); -DECLARE_CONST_HES(histogram64_fulldata); -DECLARE_CONST_HES(histogram64); -DECLARE(histogram64_allfeats); -DECLARE(histogram64_fulldata); -DECLARE(histogram64); - -DECLARE_CONST_HES(histogram256_allfeats); -DECLARE_CONST_HES(histogram256_fulldata); -DECLARE_CONST_HES(histogram256); -DECLARE(histogram256_allfeats); -DECLARE(histogram256_fulldata); -DECLARE(histogram256); - -} // namespace LightGBM - -#endif // LIGHTGBM_TREELEARNER_KERNELS_HISTOGRAM_16_64_256_HU_
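
As a minimal, standalone illustration of the aggregation that patch 2 (PR #6761) introduces in lightgbm.engine._agg_cv_result(), the sketch below reduces per-fold (dataset_name, metric_name, value, is_higher_better) tuples to (dataset_name, metric_name, mean, is_higher_better, stdv) tuples keyed by (dataset, metric) pairs instead of by concatenated "dataset metric" strings. The fold results used here are invented purely for illustration and are not taken from the LightGBM test suite.

# Sketch of the aggregation performed by lightgbm.engine._agg_cv_result() after PR #6761.
# The fold results below are invented purely for illustration.
from collections import OrderedDict
from typing import Dict, List, Tuple

import numpy as np

# (dataset_name, metric_name, value, is_higher_better)
FoldResult = Tuple[str, str, float, bool]


def agg_cv_result(raw_results: List[List[FoldResult]]):
    # keyed by (dataset_name, metric_name) tuples rather than "dataset metric" strings
    metric_types: Dict[Tuple[str, str], bool] = OrderedDict()
    metric_values: Dict[Tuple[str, str], List[float]] = OrderedDict()
    for one_result in raw_results:
        for dataset_name, metric_name, metric_value, is_higher_better in one_result:
            key = (dataset_name, metric_name)
            metric_types[key] = is_higher_better
            metric_values.setdefault(key, []).append(metric_value)
    # one (dataset, metric, mean, is_higher_better, stdv) tuple per (dataset, metric) pair
    return [
        (k[0], k[1], float(np.mean(v)), metric_types[k], float(np.std(v)))
        for k, v in metric_values.items()
    ]


# two CV folds, one validation set, one metric
folds = [
    [("valid", "binary_logloss", 0.40, False)],
    [("valid", "binary_logloss", 0.44, False)],
]
print(agg_cv_result(folds))
# approximately: [('valid', 'binary_logloss', 0.42, False, 0.02)]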
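
Similarly, the output-shape behaviour that patch 3 (PR #6753) makes consistent can be seen with a quick sketch on synthetic data (the array sizes and random inputs here are assumptions for illustration, not fixtures from the repository's tests): with a single boosting round, predict(..., pred_leaf=True) and predict(..., pred_contrib=True) now return 2-dimensional arrays, just as they do for models with more than one tree.

# Sketch of the predict() shape consistency fixed by PR #6753; the data is synthetic.
import lightgbm as lgb
import numpy as np

rng = np.random.default_rng(42)
X = rng.normal(size=(100, 4))
y = rng.normal(size=100)

bst = lgb.train(
    {"objective": "regression", "verbosity": -1},
    lgb.Dataset(X, label=y),
    num_boost_round=1,
)

print(bst.predict(X).shape)                     # (100,)
print(bst.predict(X, pred_leaf=True).shape)     # (100, 1)  -- 2-D even with a single tree
print(bst.predict(X, pred_contrib=True).shape)  # (100, 5)  -- n_features + 1 columns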