From ee0131c554844084d6b4eb7fbdf76c069fc40ad2 Mon Sep 17 00:00:00 2001 From: Nikita Titov Date: Thu, 19 Dec 2024 01:01:38 +0300 Subject: [PATCH 1/4] [ci] Add Release Drafter workflow (#6754) Co-authored-by: James Lamb --- .../{no-response.yml => no_response.yml} | 2 +- .github/workflows/release_drafter.yml | 23 +++++++++++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) rename .github/workflows/{no-response.yml => no_response.yml} (96%) create mode 100644 .github/workflows/release_drafter.yml diff --git a/.github/workflows/no-response.yml b/.github/workflows/no_response.yml similarity index 96% rename from .github/workflows/no-response.yml rename to .github/workflows/no_response.yml index 051d44fba20c..1994f53d7bc7 100644 --- a/.github/workflows/no-response.yml +++ b/.github/workflows/no_response.yml @@ -13,7 +13,7 @@ on: jobs: noResponse: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - uses: lee-dohm/no-response@v0.5.0 with: diff --git a/.github/workflows/release_drafter.yml b/.github/workflows/release_drafter.yml new file mode 100644 index 000000000000..80ad63b219bb --- /dev/null +++ b/.github/workflows/release_drafter.yml @@ -0,0 +1,23 @@ +name: Release Drafter + +permissions: + contents: read + +on: + push: + branches: + - master + +jobs: + updateReleaseDraft: + permissions: + contents: write + pull-requests: read + runs-on: ubuntu-latest + steps: + - uses: release-drafter/release-drafter@v6.0.0 + with: + config-name: release-drafter.yml + disable-autolabeler: true + env: + GITHUB_TOKEN: ${{ github.token }} From 4ee0bc05be56754672a5a36c0c402634b930d88e Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sun, 22 Dec 2024 10:27:40 -0500 Subject: [PATCH 2/4] [python-package] stop relying on string concatenation / splitting for cv() eval results (#6761) Co-authored-by: Nikita Titov --- python-package/lightgbm/callback.py | 72 +++++++++++++----------- python-package/lightgbm/engine.py | 38 +++++++++---- tests/python_package_test/test_engine.py | 21 +++++++ 3 files changed, 86 insertions(+), 45 deletions(-) diff --git a/python-package/lightgbm/callback.py b/python-package/lightgbm/callback.py index c64fb8ba755b..be8d04ed7a5e 100644 --- a/python-package/lightgbm/callback.py +++ b/python-package/lightgbm/callback.py @@ -71,6 +71,14 @@ class CallbackEnv: evaluation_result_list: Optional[_ListOfEvalResultTuples] +def _is_using_cv(env: CallbackEnv) -> bool: + """Check if model in callback env is a CVBooster.""" + # this import is here to avoid a circular import + from .engine import CVBooster + + return isinstance(env.model, CVBooster) + + def _format_eval_result(value: _EvalResultTuple, show_stdv: bool) -> str: """Format metric string.""" dataset_name, metric_name, metric_value, *_ = value @@ -143,16 +151,13 @@ def _init(self, env: CallbackEnv) -> None: ) self.eval_result.clear() for item in env.evaluation_result_list: - if len(item) == 4: # regular train - data_name, eval_name = item[:2] - else: # cv - data_name, eval_name = item[1].split() - self.eval_result.setdefault(data_name, OrderedDict()) + dataset_name, metric_name, *_ = item + self.eval_result.setdefault(dataset_name, OrderedDict()) if len(item) == 4: - self.eval_result[data_name].setdefault(eval_name, []) + self.eval_result[dataset_name].setdefault(metric_name, []) else: - self.eval_result[data_name].setdefault(f"{eval_name}-mean", []) - self.eval_result[data_name].setdefault(f"{eval_name}-stdv", []) + self.eval_result[dataset_name].setdefault(f"{metric_name}-mean", []) + 
self.eval_result[dataset_name].setdefault(f"{metric_name}-stdv", []) def __call__(self, env: CallbackEnv) -> None: if env.iteration == env.begin_iteration: @@ -163,15 +168,16 @@ def __call__(self, env: CallbackEnv) -> None: "Please report it at https://github.com/microsoft/LightGBM/issues" ) for item in env.evaluation_result_list: + # for cv(), 'metric_value' is actually a mean of metric values over all CV folds + dataset_name, metric_name, metric_value, *_ = item if len(item) == 4: - data_name, eval_name, result = item[:3] - self.eval_result[data_name][eval_name].append(result) + # train() + self.eval_result[dataset_name][metric_name].append(metric_value) else: - data_name, eval_name = item[1].split() - res_mean = item[2] - res_stdv = item[4] # type: ignore[misc] - self.eval_result[data_name][f"{eval_name}-mean"].append(res_mean) - self.eval_result[data_name][f"{eval_name}-stdv"].append(res_stdv) + # cv() + metric_std_dev = item[4] # type: ignore[misc] + self.eval_result[dataset_name][f"{metric_name}-mean"].append(metric_value) + self.eval_result[dataset_name][f"{metric_name}-stdv"].append(metric_std_dev) def record_evaluation(eval_result: Dict[str, Dict[str, List[Any]]]) -> Callable: @@ -304,15 +310,15 @@ def _gt_delta(self, curr_score: float, best_score: float, delta: float) -> bool: def _lt_delta(self, curr_score: float, best_score: float, delta: float) -> bool: return curr_score < best_score - delta - def _is_train_set(self, ds_name: str, eval_name: str, env: CallbackEnv) -> bool: + def _is_train_set(self, dataset_name: str, env: CallbackEnv) -> bool: """Check, by name, if a given Dataset is the training data.""" # for lgb.cv() with eval_train_metric=True, evaluation is also done on the training set # and those metrics are considered for early stopping - if ds_name == "cv_agg" and eval_name == "train": + if _is_using_cv(env) and dataset_name == "train": return True # for lgb.train(), it's possible to pass the training data via valid_sets with any eval_name - if isinstance(env.model, Booster) and ds_name == env.model._train_data_name: + if isinstance(env.model, Booster) and dataset_name == env.model._train_data_name: return True return False @@ -327,11 +333,13 @@ def _init(self, env: CallbackEnv) -> None: _log_warning("Early stopping is not available in dart mode") return + # get details of the first dataset + first_dataset_name, first_metric_name, *_ = env.evaluation_result_list[0] + # validation sets are guaranteed to not be identical to the training data in cv() if isinstance(env.model, Booster): only_train_set = len(env.evaluation_result_list) == 1 and self._is_train_set( - ds_name=env.evaluation_result_list[0][0], - eval_name=env.evaluation_result_list[0][1].split(" ")[0], + dataset_name=first_dataset_name, env=env, ) if only_train_set: @@ -370,8 +378,7 @@ def _init(self, env: CallbackEnv) -> None: _log_info(f"Using {self.min_delta} as min_delta for all metrics.") deltas = [self.min_delta] * n_datasets * n_metrics - # split is needed for " " case (e.g. 
"train l1") - self.first_metric = env.evaluation_result_list[0][1].split(" ")[-1] + self.first_metric = first_metric_name for eval_ret, delta in zip(env.evaluation_result_list, deltas): self.best_iter.append(0) if eval_ret[3]: # greater is better @@ -381,7 +388,7 @@ def _init(self, env: CallbackEnv) -> None: self.best_score.append(float("inf")) self.cmp_op.append(partial(self._lt_delta, delta=delta)) - def _final_iteration_check(self, env: CallbackEnv, eval_name_splitted: List[str], i: int) -> None: + def _final_iteration_check(self, *, env: CallbackEnv, metric_name: str, i: int) -> None: if env.iteration == env.end_iteration - 1: if self.verbose: best_score_str = "\t".join([_format_eval_result(x, show_stdv=True) for x in self.best_score_list[i]]) @@ -389,7 +396,7 @@ def _final_iteration_check(self, env: CallbackEnv, eval_name_splitted: List[str] "Did not meet early stopping. " f"Best iteration is:\n[{self.best_iter[i] + 1}]\t{best_score_str}" ) if self.first_metric_only: - _log_info(f"Evaluated only: {eval_name_splitted[-1]}") + _log_info(f"Evaluated only: {metric_name}") raise EarlyStopException(self.best_iter[i], self.best_score_list[i]) def __call__(self, env: CallbackEnv) -> None: @@ -405,21 +412,18 @@ def __call__(self, env: CallbackEnv) -> None: # self.best_score_list is initialized to an empty list first_time_updating_best_score_list = self.best_score_list == [] for i in range(len(env.evaluation_result_list)): - score = env.evaluation_result_list[i][2] - if first_time_updating_best_score_list or self.cmp_op[i](score, self.best_score[i]): - self.best_score[i] = score + dataset_name, metric_name, metric_value, *_ = env.evaluation_result_list[i] + if first_time_updating_best_score_list or self.cmp_op[i](metric_value, self.best_score[i]): + self.best_score[i] = metric_value self.best_iter[i] = env.iteration if first_time_updating_best_score_list: self.best_score_list.append(env.evaluation_result_list) else: self.best_score_list[i] = env.evaluation_result_list - # split is needed for " " case (e.g. 
"train l1") - eval_name_splitted = env.evaluation_result_list[i][1].split(" ") - if self.first_metric_only and self.first_metric != eval_name_splitted[-1]: + if self.first_metric_only and self.first_metric != metric_name: continue # use only the first metric for early stopping if self._is_train_set( - ds_name=env.evaluation_result_list[i][0], - eval_name=eval_name_splitted[0], + dataset_name=dataset_name, env=env, ): continue # train data for lgb.cv or sklearn wrapper (underlying lgb.train) @@ -430,9 +434,9 @@ def __call__(self, env: CallbackEnv) -> None: ) _log_info(f"Early stopping, best iteration is:\n[{self.best_iter[i] + 1}]\t{eval_result_str}") if self.first_metric_only: - _log_info(f"Evaluated only: {eval_name_splitted[-1]}") + _log_info(f"Evaluated only: {metric_name}") raise EarlyStopException(self.best_iter[i], self.best_score_list[i]) - self._final_iteration_check(env, eval_name_splitted, i) + self._final_iteration_check(env=env, metric_name=metric_name, i=i) def _should_enable_early_stopping(stopping_rounds: Any) -> bool: diff --git a/python-package/lightgbm/engine.py b/python-package/lightgbm/engine.py index 20dfc62b8856..ccbb4376a89f 100644 --- a/python-package/lightgbm/engine.py +++ b/python-package/lightgbm/engine.py @@ -581,15 +581,31 @@ def _agg_cv_result( raw_results: List[List[_LGBM_BoosterEvalMethodResultType]], ) -> List[_LGBM_BoosterEvalMethodResultWithStandardDeviationType]: """Aggregate cross-validation results.""" - cvmap: Dict[str, List[float]] = OrderedDict() - metric_type: Dict[str, bool] = {} + # build up 2 maps, of the form: + # + # OrderedDict{ + # (, ): + # } + # + # OrderedDict{ + # (, ): list[] + # } + # + metric_types: Dict[Tuple[str, str], bool] = OrderedDict() + metric_values: Dict[Tuple[str, str], List[float]] = OrderedDict() for one_result in raw_results: - for one_line in one_result: - key = f"{one_line[0]} {one_line[1]}" - metric_type[key] = one_line[3] - cvmap.setdefault(key, []) - cvmap[key].append(one_line[2]) - return [("cv_agg", k, float(np.mean(v)), metric_type[k], float(np.std(v))) for k, v in cvmap.items()] + for dataset_name, metric_name, metric_value, is_higher_better in one_result: + key = (dataset_name, metric_name) + metric_types[key] = is_higher_better + metric_values.setdefault(key, []) + metric_values[key].append(metric_value) + + # turn that into a list of tuples of the form: + # + # [ + # (, , mean(), , std_dev()) + # ] + return [(k[0], k[1], float(np.mean(v)), metric_types[k], float(np.std(v))) for k, v in metric_values.items()] def cv( @@ -812,9 +828,9 @@ def cv( ) cvbooster.update(fobj=fobj) # type: ignore[call-arg] res = _agg_cv_result(cvbooster.eval_valid(feval)) # type: ignore[call-arg] - for _, key, mean, _, std in res: - results[f"{key}-mean"].append(mean) - results[f"{key}-stdv"].append(std) + for dataset_name, metric_name, metric_mean, _, metric_std_dev in res: + results[f"{dataset_name} {metric_name}-mean"].append(metric_mean) + results[f"{dataset_name} {metric_name}-stdv"].append(metric_std_dev) try: for cb in callbacks_after_iter: cb( diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 05afddb77c77..a1797d1c1187 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -64,6 +64,13 @@ def constant_metric(preds, train_data): return ("error", 0.0, False) +def constant_metric_multi(preds, train_data): + return [ + ("important_metric", 1.5, False), + ("irrelevant_metric", 7.8, False), + ] + + def decreasing_metric(preds, 
train_data): return ("decreasing_metric", next(decreasing_generator), False) @@ -2570,6 +2577,13 @@ def train_booster(params=params_obj_verbose, **kwargs): assert "valid binary_logloss-mean" in res assert "valid error-mean" in res + # default metric in args with 1 custom function returning a list of 2 metrics + res = get_cv_result(metrics="binary_logloss", feval=constant_metric_multi) + assert len(res) == 6 + assert "valid binary_logloss-mean" in res + assert res["valid important_metric-mean"] == [1.5, 1.5] + assert res["valid irrelevant_metric-mean"] == [7.8, 7.8] + # non-default metric in args with custom one res = get_cv_result(metrics="binary_error", feval=constant_metric) assert len(res) == 4 @@ -2703,6 +2717,13 @@ def train_booster(params=params_obj_verbose, **kwargs): assert "binary_logloss" in evals_result["valid_0"] assert "error" in evals_result["valid_0"] + # default metric in params with custom function returning a list of 2 metrics + train_booster(params=params_obj_metric_log_verbose, feval=constant_metric_multi) + assert len(evals_result["valid_0"]) == 3 + assert "binary_logloss" in evals_result["valid_0"] + assert evals_result["valid_0"]["important_metric"] == [1.5, 1.5] + assert evals_result["valid_0"]["irrelevant_metric"] == [7.8, 7.8] + # non-default metric in params with custom one train_booster(params=params_obj_metric_err_verbose, feval=constant_metric) assert len(evals_result["valid_0"]) == 2 From 60b0155ac573a8ad5994c74c49e05854281e2469 Mon Sep 17 00:00:00 2001 From: RektPunk <110188257+RektPunk@users.noreply.github.com> Date: Mon, 23 Dec 2024 00:35:51 +0900 Subject: [PATCH 3/4] [python-package] Fix inconsistency in `predict()` output shape for 1-tree models (#6753) --- python-package/lightgbm/basic.py | 2 +- tests/python_package_test/test_engine.py | 90 +++++++++++++++++++++++- 2 files changed, 90 insertions(+), 2 deletions(-) diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index e06290dc1c5f..7b152fd2b006 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -1248,7 +1248,7 @@ def predict( if pred_leaf: preds = preds.astype(np.int32) is_sparse = isinstance(preds, (list, scipy.sparse.spmatrix)) - if not is_sparse and preds.size != nrow: + if not is_sparse and (preds.size != nrow or pred_leaf or pred_contrib): if preds.size % nrow == 0: preds = preds.reshape(nrow, -1) else: diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index a1797d1c1187..667cb86c1a14 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -15,7 +15,7 @@ import psutil import pytest from scipy.sparse import csr_matrix, isspmatrix_csc, isspmatrix_csr -from sklearn.datasets import load_svmlight_file, make_blobs, make_multilabel_classification +from sklearn.datasets import load_svmlight_file, make_blobs, make_classification, make_multilabel_classification from sklearn.metrics import average_precision_score, log_loss, mean_absolute_error, mean_squared_error, roc_auc_score from sklearn.model_selection import GroupKFold, TimeSeriesSplit, train_test_split @@ -2314,6 +2314,33 @@ def test_refit(): assert err_pred > new_err_pred +def test_refit_with_one_tree_regression(): + X, y = make_synthetic_regression(n_samples=1_000, n_features=2) + lgb_train = lgb.Dataset(X, label=y) + params = {"objective": "regression", "verbosity": -1} + model = lgb.train(params, lgb_train, num_boost_round=1) + model_refit = model.refit(X, y) + assert 
isinstance(model_refit, lgb.Booster) + + +def test_refit_with_one_tree_binary_classification(): + X, y = load_breast_cancer(return_X_y=True) + lgb_train = lgb.Dataset(X, label=y) + params = {"objective": "binary", "verbosity": -1} + model = lgb.train(params, lgb_train, num_boost_round=1) + model_refit = model.refit(X, y) + assert isinstance(model_refit, lgb.Booster) + + +def test_refit_with_one_tree_multiclass_classification(): + X, y = load_iris(return_X_y=True) + lgb_train = lgb.Dataset(X, y) + params = {"objective": "multiclass", "num_class": 3, "verbose": -1} + model = lgb.train(params, lgb_train, num_boost_round=1) + model_refit = model.refit(X, y) + assert isinstance(model_refit, lgb.Booster) + + def test_refit_dataset_params(rng): # check refit accepts dataset_params X, y = load_breast_cancer(return_X_y=True) @@ -3872,6 +3899,67 @@ def test_predict_stump(rng, use_init_score): np.testing.assert_allclose(preds_all, np.full_like(preds_all, fill_value=y_avg)) +def test_predict_regression_output_shape(): + n_samples = 1_000 + n_features = 4 + X, y = make_synthetic_regression(n_samples=n_samples, n_features=n_features) + dtrain = lgb.Dataset(X, label=y) + params = {"objective": "regression", "verbosity": -1} + + # 1-round model + bst = lgb.train(params, dtrain, num_boost_round=1) + assert bst.predict(X).shape == (n_samples,) + assert bst.predict(X, pred_contrib=True).shape == (n_samples, n_features + 1) + assert bst.predict(X, pred_leaf=True).shape == (n_samples, 1) + + # 2-round model + bst = lgb.train(params, dtrain, num_boost_round=2) + assert bst.predict(X).shape == (n_samples,) + assert bst.predict(X, pred_contrib=True).shape == (n_samples, n_features + 1) + assert bst.predict(X, pred_leaf=True).shape == (n_samples, 2) + + +def test_predict_binary_classification_output_shape(): + n_samples = 1_000 + n_features = 4 + X, y = make_classification(n_samples=n_samples, n_features=n_features, n_classes=2) + dtrain = lgb.Dataset(X, label=y) + params = {"objective": "binary", "verbosity": -1} + + # 1-round model + bst = lgb.train(params, dtrain, num_boost_round=1) + assert bst.predict(X).shape == (n_samples,) + assert bst.predict(X, pred_contrib=True).shape == (n_samples, n_features + 1) + assert bst.predict(X, pred_leaf=True).shape == (n_samples, 1) + + # 2-round model + bst = lgb.train(params, dtrain, num_boost_round=2) + assert bst.predict(X).shape == (n_samples,) + assert bst.predict(X, pred_contrib=True).shape == (n_samples, n_features + 1) + assert bst.predict(X, pred_leaf=True).shape == (n_samples, 2) + + +def test_predict_multiclass_classification_output_shape(): + n_samples = 1_000 + n_features = 10 + n_classes = 3 + X, y = make_classification(n_samples=n_samples, n_features=n_features, n_classes=n_classes, n_informative=6) + dtrain = lgb.Dataset(X, label=y) + params = {"objective": "multiclass", "verbosity": -1, "num_class": n_classes} + + # 1-round model + bst = lgb.train(params, dtrain, num_boost_round=1) + assert bst.predict(X).shape == (n_samples, n_classes) + assert bst.predict(X, pred_contrib=True).shape == (n_samples, n_classes * (n_features + 1)) + assert bst.predict(X, pred_leaf=True).shape == (n_samples, n_classes) + + # 2-round model + bst = lgb.train(params, dtrain, num_boost_round=2) + assert bst.predict(X).shape == (n_samples, n_classes) + assert bst.predict(X, pred_contrib=True).shape == (n_samples, n_classes * (n_features + 1)) + assert bst.predict(X, pred_leaf=True).shape == (n_samples, n_classes * 2) + + def test_average_precision_metric(): # test against sklearn 
average precision metric X, y = load_breast_cancer(return_X_y=True) From f3bd64a1ea815f761d0f845bc16b60f0a71fdd99 Mon Sep 17 00:00:00 2001 From: shiyu1994 Date: Thu, 2 Jan 2025 14:55:02 +0800 Subject: [PATCH 4/4] [CUDA] remove src/treelearner/kernels (#6766) * remove src/treelearner/kernels * Update CMakeLists.txt * clean up --- CMakeLists.txt | 56 -- .../kernels/histogram_16_64_256.cu | 949 ------------------ .../kernels/histogram_16_64_256.hu | 160 --- 3 files changed, 1165 deletions(-) delete mode 100644 src/treelearner/kernels/histogram_16_64_256.cu delete mode 100644 src/treelearner/kernels/histogram_16_64_256.hu diff --git a/CMakeLists.txt b/CMakeLists.txt index 4f57cf9622e6..36557304548f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -252,54 +252,6 @@ if(USE_CUDA) set(CMAKE_CUDA_STANDARD 11) set(CMAKE_CUDA_STANDARD_REQUIRED ON) endif() - - set( - BASE_DEFINES - -DPOWER_FEATURE_WORKGROUPS=12 - -DUSE_CONSTANT_BUF=0 - ) - set( - ALLFEATS_DEFINES - ${BASE_DEFINES} - -DENABLE_ALL_FEATURES - ) - set( - FULLDATA_DEFINES - ${ALLFEATS_DEFINES} - -DIGNORE_INDICES - ) - - message(STATUS "ALLFEATS_DEFINES: ${ALLFEATS_DEFINES}") - message(STATUS "FULLDATA_DEFINES: ${FULLDATA_DEFINES}") - - function(add_histogram hsize hname hadd hconst hdir) - add_library(histo${hsize}${hname} OBJECT src/treelearner/kernels/histogram${hsize}.cu) - set_target_properties( - histo${hsize}${hname} - PROPERTIES - CUDA_SEPARABLE_COMPILATION ON - CUDA_ARCHITECTURES ${CUDA_ARCHS} - ) - if(hadd) - list(APPEND histograms histo${hsize}${hname}) - set(histograms ${histograms} PARENT_SCOPE) - endif() - target_compile_definitions( - histo${hsize}${hname} - PRIVATE - -DCONST_HESSIAN=${hconst} - ${hdir} - ) - endfunction() - - foreach(hsize _16_64_256) - add_histogram("${hsize}" "_sp_const" "True" "1" "${BASE_DEFINES}") - add_histogram("${hsize}" "_sp" "True" "0" "${BASE_DEFINES}") - add_histogram("${hsize}" "-allfeats_sp_const" "False" "1" "${ALLFEATS_DEFINES}") - add_histogram("${hsize}" "-allfeats_sp" "False" "0" "${ALLFEATS_DEFINES}") - add_histogram("${hsize}" "-fulldata_sp_const" "True" "1" "${FULLDATA_DEFINES}") - add_histogram("${hsize}" "-fulldata_sp" "True" "0" "${FULLDATA_DEFINES}") - endforeach() endif() include(CheckCXXSourceCompiles) @@ -634,14 +586,6 @@ if(USE_CUDA) CUDA_RESOLVE_DEVICE_SYMBOLS ON ) endif() - - # histograms are list of object libraries. Linking object library to other - # object libraries only gets usage requirements, the linked objects won't be - # used. Thus we have to call target_link_libraries on final targets here. - if(BUILD_CLI) - target_link_libraries(lightgbm PRIVATE ${histograms}) - endif() - target_link_libraries(_lightgbm PRIVATE ${histograms}) endif() if(WIN32) diff --git a/src/treelearner/kernels/histogram_16_64_256.cu b/src/treelearner/kernels/histogram_16_64_256.cu deleted file mode 100644 index 9d8427a6f9a8..000000000000 --- a/src/treelearner/kernels/histogram_16_64_256.cu +++ /dev/null @@ -1,949 +0,0 @@ -/*! - * Copyright (c) 2020 IBM Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for license information. 
- */ - -#include - -#include -#include - -#include "histogram_16_64_256.hu" - -namespace LightGBM { - -// atomic add for float number in local memory -inline __device__ void atomic_local_add_f(acc_type *addr, const acc_type val) { - atomicAdd(addr, static_cast(val)); -} - -// histogram16 stuff -#ifdef ENABLE_ALL_FEATURES -#ifdef IGNORE_INDICES -#define KERNEL_NAME histogram16_fulldata -#else // IGNORE_INDICES -#define KERNEL_NAME histogram16 -#endif // IGNORE_INDICES -#else // ENABLE_ALL_FEATURES -#error "ENABLE_ALL_FEATURES should always be 1" -#define KERNEL_NAME histogram16 -#endif // ENABLE_ALL_FEATURES -#define NUM_BINS 16 -#define LOCAL_MEM_SIZE ((sizeof(unsigned int) + 2 * sizeof(acc_type)) * NUM_BINS) - -// this function will be called by histogram16 -// we have one sub-histogram of one feature in local memory, and need to read others -inline void __device__ within_kernel_reduction16x4(const acc_type* __restrict__ feature_sub_hist, - const unsigned int skip_id, - const unsigned int old_val_cont_bin0, - const uint16_t num_sub_hist, - acc_type* __restrict__ output_buf, - acc_type* __restrict__ local_hist, - const size_t power_feature_workgroups) { - const uint16_t ltid = threadIdx.x; - acc_type grad_bin = local_hist[ltid * 2]; - acc_type hess_bin = local_hist[ltid * 2 + 1]; - unsigned int* __restrict__ local_cnt = reinterpret_cast(local_hist + 2 * NUM_BINS); - - unsigned int cont_bin; - if (power_feature_workgroups != 0) { - cont_bin = ltid ? local_cnt[ltid] : old_val_cont_bin0; - } else { - cont_bin = local_cnt[ltid]; - } - uint16_t i; - - if (power_feature_workgroups != 0) { - // add all sub-histograms for feature - const acc_type* __restrict__ p = feature_sub_hist + ltid; - for (i = 0; i < skip_id; ++i) { - grad_bin += *p; p += NUM_BINS; - hess_bin += *p; p += NUM_BINS; - cont_bin += as_acc_int_type(*p); p += NUM_BINS; - } - - // skip the counters we already have - p += 3 * NUM_BINS; - - for (i = i + 1; i < num_sub_hist; ++i) { - grad_bin += *p; p += NUM_BINS; - hess_bin += *p; p += NUM_BINS; - cont_bin += as_acc_int_type(*p); p += NUM_BINS; - } - } - __syncthreads(); - - output_buf[ltid * 2 + 0] = grad_bin; - output_buf[ltid * 2 + 1] = hess_bin; -} - -#if USE_CONSTANT_BUF == 1 -__kernel void KERNEL_NAME(__global const uchar* restrict feature_data_base, - __constant const uchar* restrict feature_masks __attribute__((max_constant_size(65536))), - const data_size_t feature_size, - __constant const data_size_t* restrict data_indices __attribute__((max_constant_size(65536))), - const data_size_t num_data, - __constant const score_t* restrict ordered_gradients __attribute__((max_constant_size(65536))), -#if CONST_HESSIAN == 0 - __constant const score_t* restrict ordered_hessians __attribute__((max_constant_size(65536))), -#else - const score_t const_hessian, -#endif - char* __restrict__ output_buf, - volatile int * sync_counters, - acc_type* __restrict__ hist_buf_base, - const size_t power_feature_workgroups) { -#else -__global__ void KERNEL_NAME(const uchar* feature_data_base, - const uchar* __restrict__ feature_masks, - const data_size_t feature_size, - const data_size_t* data_indices, - const data_size_t num_data, - const score_t* ordered_gradients, -#if CONST_HESSIAN == 0 - const score_t* ordered_hessians, -#else - const score_t const_hessian, -#endif - char* __restrict__ output_buf, - volatile int * sync_counters, - acc_type* __restrict__ hist_buf_base, - const size_t power_feature_workgroups) { -#endif - // allocate the local memory array aligned with float2, to guarantee correct 
alignment on NVIDIA platforms - // otherwise a "Misaligned Address" exception may occur - __shared__ float2 shared_array[LOCAL_MEM_SIZE/sizeof(float2)]; - const unsigned int gtid = blockIdx.x * blockDim.x + threadIdx.x; - const uint16_t ltid = threadIdx.x; - const uint16_t lsize = NUM_BINS; // get_local_size(0); - const uint16_t group_id = blockIdx.x; - - // local memory per workgroup is 3 KB - // clear local memory - unsigned int *ptr = reinterpret_cast(shared_array); - for (int i = ltid; i < LOCAL_MEM_SIZE/sizeof(unsigned int); i += lsize) { - ptr[i] = 0; - } - __syncthreads(); - // gradient/hessian histograms - // assume this starts at 32 * 4 = 128-byte boundary // What does it mean? boundary?? - // total size: 2 * 256 * size_of(float) = 2 KB - // organization: each feature/grad/hessian is at a different bank, - // as independent of the feature value as possible - acc_type *gh_hist = reinterpret_cast(shared_array); - - // counter histogram - // total size: 256 * size_of(unsigned int) = 1 KB - unsigned int *cnt_hist = reinterpret_cast(gh_hist + 2 * NUM_BINS); - - // odd threads (1, 3, ...) compute histograms for hessians first - // even thread (0, 2, ...) compute histograms for gradients first - // etc. - uchar is_hessian_first = ltid & 1; - - uint16_t feature_id = group_id >> power_feature_workgroups; - - // each 2^POWER_FEATURE_WORKGROUPS workgroups process on one feature (compile-time constant) - // feature_size is the number of examples per feature - const uchar *feature_data = feature_data_base + feature_id * feature_size; - - // size of threads that process this feature4 - const unsigned int subglobal_size = lsize * (1 << power_feature_workgroups); - - // equivalent thread ID in this subgroup for this feature4 - const unsigned int subglobal_tid = gtid - feature_id * subglobal_size; - - - data_size_t ind; - data_size_t ind_next; - #ifdef IGNORE_INDICES - ind = subglobal_tid; - #else - ind = data_indices[subglobal_tid]; - #endif - - // extract feature mask, when a byte is set to 0, that feature is disabled - uchar feature_mask = feature_masks[feature_id]; - // exit if the feature is masked - if (!feature_mask) { - return; - } else { - feature_mask = feature_mask - 1; // feature_mask is used for get feature (1: 4bit feature, 0: 8bit feature) - } - - // STAGE 1: read feature data, and gradient and hessian - // first half of the threads read feature data from global memory - // We will prefetch data into the "next" variable at the beginning of each iteration - uchar feature; - uchar feature_next; - uint16_t bin; - - feature = feature_data[ind >> feature_mask]; - if (feature_mask) { - feature = (feature >> ((ind & 1) << 2)) & 0xf; - } - bin = feature; - acc_type grad_bin = 0.0f, hess_bin = 0.0f; - acc_type *addr_bin; - - // store gradient and hessian - score_t grad, hess; - score_t grad_next, hess_next; - grad = ordered_gradients[ind]; - #if CONST_HESSIAN == 0 - hess = ordered_hessians[ind]; - #endif - - // there are 2^POWER_FEATURE_WORKGROUPS workgroups processing each feature4 - for (unsigned int i = subglobal_tid; i < num_data; i += subglobal_size) { - // prefetch the next iteration variables - // we don't need boundary check because we have made the buffer large - int i_next = i + subglobal_size; - #ifdef IGNORE_INDICES - // we need to check to bounds here - ind_next = i_next < num_data ? 
i_next : i; - #else - ind_next = data_indices[i_next]; - #endif - - grad_next = ordered_gradients[ind_next]; - #if CONST_HESSIAN == 0 - hess_next = ordered_hessians[ind_next]; - #endif - - // STAGE 2: accumulate gradient and hessian - if (bin != feature) { - addr_bin = gh_hist + bin * 2 + is_hessian_first; - #if CONST_HESSIAN == 0 - acc_type acc_bin = is_hessian_first ? hess_bin : grad_bin; - atomic_local_add_f(addr_bin, acc_bin); - - addr_bin = addr_bin + 1 - 2 * is_hessian_first; - acc_bin = is_hessian_first ? grad_bin : hess_bin; - atomic_local_add_f(addr_bin, acc_bin); - - #elif CONST_HESSIAN == 1 - atomic_local_add_f(addr_bin, grad_bin); - #endif - - bin = feature; - grad_bin = grad; - hess_bin = hess; - } else { - grad_bin += grad; - hess_bin += hess; - } - - // prefetch the next iteration variables - feature_next = feature_data[ind_next >> feature_mask]; - - // STAGE 3: accumulate counter - atomicAdd(cnt_hist + feature, 1); - - // STAGE 4: update next stat - grad = grad_next; - hess = hess_next; - if (!feature_mask) { - feature = feature_next; - } else { - feature = (feature_next >> ((ind_next & 1) << 2)) & 0xf; - } - } - - - addr_bin = gh_hist + bin * 2 + is_hessian_first; - #if CONST_HESSIAN == 0 - acc_type acc_bin = is_hessian_first ? hess_bin : grad_bin; - atomic_local_add_f(addr_bin, acc_bin); - - addr_bin = addr_bin + 1 - 2 * is_hessian_first; - acc_bin = is_hessian_first ? grad_bin : hess_bin; - atomic_local_add_f(addr_bin, acc_bin); - - #elif CONST_HESSIAN == 1 - atomic_local_add_f(addr_bin, grad_bin); - #endif - __syncthreads(); - - #if CONST_HESSIAN == 1 - // make a final reduction - gh_hist[ltid * 2] += gh_hist[ltid * 2 + 1]; - gh_hist[ltid * 2 + 1] = const_hessian * cnt_hist[ltid]; // counter move to this position - __syncthreads(); - #endif - -#if POWER_FEATURE_WORKGROUPS != 0 - acc_type *__restrict__ output = reinterpret_cast(output_buf) + group_id * 3 * NUM_BINS; - // write gradients and Hessians - acc_type *__restrict__ ptr_f = output; - for (uint16_t i = ltid; i < 2 * NUM_BINS; i += lsize) { - // even threads read gradients, odd threads read Hessians - acc_type value = gh_hist[i]; - ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value; - } - // write counts - acc_int_type *__restrict__ ptr_i = reinterpret_cast(output + 2 * NUM_BINS); - for (uint16_t i = ltid; i < NUM_BINS; i += lsize) { - unsigned int value = cnt_hist[i]; - ptr_i[i] = value; - } - __syncthreads(); - __threadfence(); - unsigned int * counter_val = cnt_hist; - // backup the old value - unsigned int old_val = *counter_val; - if (ltid == 0) { - // all workgroups processing the same feature add this counter - *counter_val = atomicAdd(const_cast(sync_counters + feature_id), 1); - } - // make sure everyone in this workgroup is here - __syncthreads(); - // everyone in this workgroup: if we are the last workgroup, then do reduction! 
- if (*counter_val == (1 << power_feature_workgroups) - 1) { - if (ltid == 0) { - sync_counters[feature_id] = 0; - } -#else - } - // only 1 work group, no need to increase counter - // the reduction will become a simple copy - { - unsigned int old_val; // dummy -#endif - // locate our feature's block in output memory - unsigned int output_offset = (feature_id << power_feature_workgroups); - acc_type const * __restrict__ feature_subhists = - reinterpret_cast(output_buf) + output_offset * 3 * NUM_BINS; - // skip reading the data already in local memory - unsigned int skip_id = group_id - output_offset; - // locate output histogram location for this feature4 - acc_type *__restrict__ hist_buf = hist_buf_base + feature_id * 2 * NUM_BINS; - - within_kernel_reduction16x4(feature_subhists, skip_id, old_val, 1 << power_feature_workgroups, hist_buf, reinterpret_cast(shared_array), power_feature_workgroups); - } -} - -// end of histogram16 stuff - -// histogram64 stuff -#undef KERNEL_NAME -#undef NUM_BINS -#undef LOCAL_MEM_SIZE -#ifdef ENABLE_ALL_FEATURES -#ifdef IGNORE_INDICES -#define KERNEL_NAME histogram64_fulldata -#else // IGNORE_INDICES -#define KERNEL_NAME histogram64 // seems like ENABLE_ALL_FEATURES is set to 1 in the header if its disabled -// #define KERNEL_NAME histogram64_allfeats -#endif // IGNORE_INDICES -#else // ENABLE_ALL_FEATURES -#error "ENABLE_ALL_FEATURES should always be 1" -#define KERNEL_NAME histogram64 -#endif // ENABLE_ALL_FEATURES -#define NUM_BINS 64 -#define LOCAL_MEM_SIZE ((sizeof(unsigned int) + 2 * sizeof(acc_type)) * NUM_BINS) - -// this function will be called by histogram64 -// we have one sub-histogram of one feature in local memory, and need to read others -inline void __device__ within_kernel_reduction64x4(const acc_type* __restrict__ feature_sub_hist, - const unsigned int skip_id, - const unsigned int old_val_cont_bin0, - const uint16_t num_sub_hist, - acc_type* __restrict__ output_buf, - acc_type* __restrict__ local_hist, - const size_t power_feature_workgroups) { - const uint16_t ltid = threadIdx.x; - acc_type grad_bin = local_hist[ltid * 2]; - acc_type hess_bin = local_hist[ltid * 2 + 1]; - unsigned int* __restrict__ local_cnt = reinterpret_cast(local_hist + 2 * NUM_BINS); - - unsigned int cont_bin; - if (power_feature_workgroups != 0) { - cont_bin = ltid ? 
local_cnt[ltid] : old_val_cont_bin0; - } else { - cont_bin = local_cnt[ltid]; - } - uint16_t i; - - if (power_feature_workgroups != 0) { - // add all sub-histograms for feature - const acc_type* __restrict__ p = feature_sub_hist + ltid; - for (i = 0; i < skip_id; ++i) { - grad_bin += *p; p += NUM_BINS; - hess_bin += *p; p += NUM_BINS; - cont_bin += as_acc_int_type(*p); p += NUM_BINS; - } - - // skip the counters we already have - p += 3 * NUM_BINS; - - for (i = i + 1; i < num_sub_hist; ++i) { - grad_bin += *p; p += NUM_BINS; - hess_bin += *p; p += NUM_BINS; - cont_bin += as_acc_int_type(*p); p += NUM_BINS; - } - } - __syncthreads(); - - output_buf[ltid * 2 + 0] = grad_bin; - output_buf[ltid * 2 + 1] = hess_bin; -} - -#if USE_CONSTANT_BUF == 1 -__kernel void KERNEL_NAME(__global const uchar* restrict feature_data_base, - __constant const uchar* restrict feature_masks __attribute__((max_constant_size(65536))), - const data_size_t feature_size, - __constant const data_size_t* restrict data_indices __attribute__((max_constant_size(65536))), - const data_size_t num_data, - __constant const score_t* restrict ordered_gradients __attribute__((max_constant_size(65536))), -#if CONST_HESSIAN == 0 - __constant const score_t* restrict ordered_hessians __attribute__((max_constant_size(65536))), -#else - const score_t const_hessian, -#endif - char* __restrict__ output_buf, - volatile int * sync_counters, - acc_type* __restrict__ hist_buf_base, - const size_t power_feature_workgroups) { -#else -__global__ void KERNEL_NAME(const uchar* feature_data_base, - const uchar* __restrict__ feature_masks, - const data_size_t feature_size, - const data_size_t* data_indices, - const data_size_t num_data, - const score_t* ordered_gradients, -#if CONST_HESSIAN == 0 - const score_t* ordered_hessians, -#else - const score_t const_hessian, -#endif - char* __restrict__ output_buf, - volatile int * sync_counters, - acc_type* __restrict__ hist_buf_base, - const size_t power_feature_workgroups) { -#endif - // allocate the local memory array aligned with float2, to guarantee correct alignment on NVIDIA platforms - // otherwise a "Misaligned Address" exception may occur - __shared__ float2 shared_array[LOCAL_MEM_SIZE/sizeof(float2)]; - const unsigned int gtid = blockIdx.x * blockDim.x + threadIdx.x; - const uint16_t ltid = threadIdx.x; - const uint16_t lsize = NUM_BINS; // get_local_size(0); - const uint16_t group_id = blockIdx.x; - - // local memory per workgroup is 3 KB - // clear local memory - unsigned int *ptr = reinterpret_cast(shared_array); - for (int i = ltid; i < LOCAL_MEM_SIZE/sizeof(unsigned int); i += lsize) { - ptr[i] = 0; - } - __syncthreads(); - // gradient/hessian histograms - // assume this starts at 32 * 4 = 128-byte boundary // What does it mean? boundary?? - // total size: 2 * 256 * size_of(float) = 2 KB - // organization: each feature/grad/hessian is at a different bank, - // as independent of the feature value as possible - acc_type *gh_hist = reinterpret_cast(shared_array); - - // counter histogram - // total size: 256 * size_of(unsigned int) = 1 KB - unsigned int *cnt_hist = reinterpret_cast(gh_hist + 2 * NUM_BINS); - - // odd threads (1, 3, ...) compute histograms for Hessians first - // even thread (0, 2, ...) compute histograms for gradients first - // etc. 
- uchar is_hessian_first = ltid & 1; - - uint16_t feature_id = group_id >> power_feature_workgroups; - - // each 2^POWER_FEATURE_WORKGROUPS workgroups process on one feature (compile-time constant) - // feature_size is the number of examples per feature - const uchar *feature_data = feature_data_base + feature_id * feature_size; - - // size of threads that process this feature4 - const unsigned int subglobal_size = lsize * (1 << power_feature_workgroups); - - // equivalent thread ID in this subgroup for this feature4 - const unsigned int subglobal_tid = gtid - feature_id * subglobal_size; - - data_size_t ind; - data_size_t ind_next; - #ifdef IGNORE_INDICES - ind = subglobal_tid; - #else - ind = data_indices[subglobal_tid]; - #endif - - // extract feature mask, when a byte is set to 0, that feature is disabled - uchar feature_mask = feature_masks[feature_id]; - // exit if the feature is masked - if (!feature_mask) { - return; - } else { - feature_mask = feature_mask - 1; // feature_mask is used for get feature (1: 4bit feature, 0: 8bit feature) - } - - // STAGE 1: read feature data, and gradient and hessian - // first half of the threads read feature data from global memory - // We will prefetch data into the "next" variable at the beginning of each iteration - uchar feature; - uchar feature_next; - uint16_t bin; - - feature = feature_data[ind >> feature_mask]; - if (feature_mask) { - feature = (feature >> ((ind & 1) << 2)) & 0xf; - } - bin = feature; - acc_type grad_bin = 0.0f, hess_bin = 0.0f; - acc_type *addr_bin; - - // store gradient and hessian - score_t grad, hess; - score_t grad_next, hess_next; - grad = ordered_gradients[ind]; - #if CONST_HESSIAN == 0 - hess = ordered_hessians[ind]; - #endif - - // there are 2^POWER_FEATURE_WORKGROUPS workgroups processing each feature4 - for (unsigned int i = subglobal_tid; i < num_data; i += subglobal_size) { - // prefetch the next iteration variables - // we don't need boundary check because we have made the buffer large - int i_next = i + subglobal_size; - #ifdef IGNORE_INDICES - // we need to check to bounds here - ind_next = i_next < num_data ? i_next : i; - #else - ind_next = data_indices[i_next]; - #endif - - grad_next = ordered_gradients[ind_next]; - #if CONST_HESSIAN == 0 - hess_next = ordered_hessians[ind_next]; - #endif - - // STAGE 2: accumulate gradient and hessian - if (bin != feature) { - addr_bin = gh_hist + bin * 2 + is_hessian_first; - #if CONST_HESSIAN == 0 - acc_type acc_bin = is_hessian_first ? hess_bin : grad_bin; - atomic_local_add_f(addr_bin, acc_bin); - - addr_bin = addr_bin + 1 - 2 * is_hessian_first; - acc_bin = is_hessian_first ? grad_bin : hess_bin; - atomic_local_add_f(addr_bin, acc_bin); - - #elif CONST_HESSIAN == 1 - atomic_local_add_f(addr_bin, grad_bin); - #endif - - bin = feature; - grad_bin = grad; - hess_bin = hess; - } else { - grad_bin += grad; - hess_bin += hess; - } - - // prefetch the next iteration variables - feature_next = feature_data[ind_next >> feature_mask]; - - // STAGE 3: accumulate counter - atomicAdd(cnt_hist + feature, 1); - - // STAGE 4: update next stat - grad = grad_next; - hess = hess_next; - if (!feature_mask) { - feature = feature_next; - } else { - feature = (feature_next >> ((ind_next & 1) << 2)) & 0xf; - } - } - - addr_bin = gh_hist + bin * 2 + is_hessian_first; - #if CONST_HESSIAN == 0 - acc_type acc_bin = is_hessian_first ? hess_bin : grad_bin; - atomic_local_add_f(addr_bin, acc_bin); - - addr_bin = addr_bin + 1 - 2 * is_hessian_first; - acc_bin = is_hessian_first ? 
grad_bin : hess_bin; - atomic_local_add_f(addr_bin, acc_bin); - - #elif CONST_HESSIAN == 1 - atomic_local_add_f(addr_bin, grad_bin); - #endif - __syncthreads(); - - #if CONST_HESSIAN == 1 - // make a final reduction - gh_hist[ltid * 2] += gh_hist[ltid * 2 + 1]; - gh_hist[ltid * 2 + 1] = const_hessian * cnt_hist[ltid]; // counter move to this position - __syncthreads(); - #endif - -#if POWER_FEATURE_WORKGROUPS != 0 - acc_type *__restrict__ output = reinterpret_cast(output_buf) + group_id * 3 * NUM_BINS; - // write gradients and Hessians - acc_type *__restrict__ ptr_f = output; - for (uint16_t i = ltid; i < 2 * NUM_BINS; i += lsize) { - // even threads read gradients, odd threads read Hessians - acc_type value = gh_hist[i]; - ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value; - } - // write counts - acc_int_type *__restrict__ ptr_i = reinterpret_cast(output + 2 * NUM_BINS); - for (uint16_t i = ltid; i < NUM_BINS; i += lsize) { - unsigned int value = cnt_hist[i]; - ptr_i[i] = value; - } - __syncthreads(); - __threadfence(); - unsigned int * counter_val = cnt_hist; - // backup the old value - unsigned int old_val = *counter_val; - if (ltid == 0) { - // all workgroups processing the same feature add this counter - *counter_val = atomicAdd(const_cast(sync_counters + feature_id), 1); - } - // make sure everyone in this workgroup is here - __syncthreads(); - // everyone in this workgroup: if we are the last workgroup, then do reduction! - if (*counter_val == (1 << power_feature_workgroups) - 1) { - if (ltid == 0) { - sync_counters[feature_id] = 0; - } -#else - } - // only 1 work group, no need to increase counter - // the reduction will become a simple copy - { - unsigned int old_val; // dummy -#endif - // locate our feature's block in output memory - unsigned int output_offset = (feature_id << power_feature_workgroups); - acc_type const * __restrict__ feature_subhists = - reinterpret_cast(output_buf) + output_offset * 3 * NUM_BINS; - // skip reading the data already in local memory - unsigned int skip_id = group_id - output_offset; - // locate output histogram location for this feature4 - acc_type *__restrict__ hist_buf = hist_buf_base + feature_id * 2 * NUM_BINS; - - within_kernel_reduction64x4(feature_subhists, skip_id, old_val, 1 << power_feature_workgroups, hist_buf, reinterpret_cast(shared_array), power_feature_workgroups); - } -} - -// end of histogram64 stuff - -// histogram256 stuff -#undef KERNEL_NAME -#undef NUM_BINS -#undef LOCAL_MEM_SIZE -#ifdef ENABLE_ALL_FEATURES -#ifdef IGNORE_INDICES -#define KERNEL_NAME histogram256_fulldata -#else // IGNORE_INDICES -#define KERNEL_NAME histogram256 // seems like ENABLE_ALL_FEATURES is set to 1 in the header if its disabled -// #define KERNEL_NAME histogram256_allfeats -#endif // IGNORE_INDICES -#else // ENABLE_ALL_FEATURES -#error "ENABLE_ALL_FEATURES should always be 1" -#define KERNEL_NAME histogram256 -#endif // ENABLE_ALL_FEATURES -#define NUM_BINS 256 -#define LOCAL_MEM_SIZE ((sizeof(unsigned int) + 2 * sizeof(acc_type)) * NUM_BINS) - -// this function will be called by histogram256 -// we have one sub-histogram of one feature in local memory, and need to read others -inline void __device__ within_kernel_reduction256x4(const acc_type* __restrict__ feature_sub_hist, - const unsigned int skip_id, - const unsigned int old_val_cont_bin0, - const uint16_t num_sub_hist, - acc_type* __restrict__ output_buf, - acc_type* __restrict__ local_hist, - const size_t power_feature_workgroups) { - const uint16_t ltid = threadIdx.x; - acc_type grad_bin = 
local_hist[ltid * 2]; - acc_type hess_bin = local_hist[ltid * 2 + 1]; - unsigned int* __restrict__ local_cnt = reinterpret_cast(local_hist + 2 * NUM_BINS); - - unsigned int cont_bin; - if (power_feature_workgroups != 0) { - cont_bin = ltid ? local_cnt[ltid] : old_val_cont_bin0; - } else { - cont_bin = local_cnt[ltid]; - } - uint16_t i; - - if (power_feature_workgroups != 0) { - // add all sub-histograms for feature - const acc_type* __restrict__ p = feature_sub_hist + ltid; - for (i = 0; i < skip_id; ++i) { - grad_bin += *p; p += NUM_BINS; - hess_bin += *p; p += NUM_BINS; - cont_bin += as_acc_int_type(*p); p += NUM_BINS; - } - - // skip the counters we already have - p += 3 * NUM_BINS; - - for (i = i + 1; i < num_sub_hist; ++i) { - grad_bin += *p; p += NUM_BINS; - hess_bin += *p; p += NUM_BINS; - cont_bin += as_acc_int_type(*p); p += NUM_BINS; - } - } - - __syncthreads(); - - output_buf[ltid * 2 + 0] = grad_bin; - output_buf[ltid * 2 + 1] = hess_bin; -} - -#if USE_CONSTANT_BUF == 1 -__kernel void KERNEL_NAME(__global const uchar* restrict feature_data_base, - __constant const uchar* restrict feature_masks __attribute__((max_constant_size(65536))), - const data_size_t feature_size, - __constant const data_size_t* restrict data_indices __attribute__((max_constant_size(65536))), - const data_size_t num_data, - __constant const score_t* restrict ordered_gradients __attribute__((max_constant_size(65536))), -#if CONST_HESSIAN == 0 - __constant const score_t* restrict ordered_hessians __attribute__((max_constant_size(65536))), -#else - const score_t const_hessian, -#endif - char* __restrict__ output_buf, - volatile int * sync_counters, - acc_type* __restrict__ hist_buf_base, - const size_t power_feature_workgroups) { -#else -__global__ void KERNEL_NAME(const uchar* feature_data_base, - const uchar* __restrict__ feature_masks, - const data_size_t feature_size, - const data_size_t* data_indices, - const data_size_t num_data, - const score_t* ordered_gradients, -#if CONST_HESSIAN == 0 - const score_t* ordered_hessians, -#else - const score_t const_hessian, -#endif - char* __restrict__ output_buf, - volatile int * sync_counters, - acc_type* __restrict__ hist_buf_base, - const size_t power_feature_workgroups) { -#endif - // allocate the local memory array aligned with float2, to guarantee correct alignment on NVIDIA platforms - // otherwise a "Misaligned Address" exception may occur - __shared__ float2 shared_array[LOCAL_MEM_SIZE/sizeof(float2)]; - const unsigned int gtid = blockIdx.x * blockDim.x + threadIdx.x; - const uint16_t ltid = threadIdx.x; - const uint16_t lsize = NUM_BINS; // get_local_size(0); - const uint16_t group_id = blockIdx.x; - - // local memory per workgroup is 3 KB - // clear local memory - unsigned int *ptr = reinterpret_cast(shared_array); - for (int i = ltid; i < LOCAL_MEM_SIZE/sizeof(unsigned int); i += lsize) { - ptr[i] = 0; - } - __syncthreads(); - // gradient/hessian histograms - // assume this starts at 32 * 4 = 128-byte boundary // What does it mean? boundary?? - // total size: 2 * 256 * size_of(float) = 2 KB - // organization: each feature/grad/hessian is at a different bank, - // as independent of the feature value as possible - acc_type *gh_hist = reinterpret_cast(shared_array); - - // counter histogram - // total size: 256 * size_of(unsigned int) = 1 KB - unsigned int *cnt_hist = reinterpret_cast(gh_hist + 2 * NUM_BINS); - - // odd threads (1, 3, ...) compute histograms for hessians first - // even thread (0, 2, ...) compute histograms for gradients first - // etc. 
- uchar is_hessian_first = ltid & 1; - - uint16_t feature_id = group_id >> power_feature_workgroups; - - // each 2^POWER_FEATURE_WORKGROUPS workgroups process on one feature (compile-time constant) - // feature_size is the number of examples per feature - const uchar *feature_data = feature_data_base + feature_id * feature_size; - - // size of threads that process this feature4 - const unsigned int subglobal_size = lsize * (1 << power_feature_workgroups); - - // equivalent thread ID in this subgroup for this feature4 - const unsigned int subglobal_tid = gtid - feature_id * subglobal_size; - - data_size_t ind; - data_size_t ind_next; - #ifdef IGNORE_INDICES - ind = subglobal_tid; - #else - ind = data_indices[subglobal_tid]; - #endif - - // extract feature mask, when a byte is set to 0, that feature is disabled - uchar feature_mask = feature_masks[feature_id]; - // exit if the feature is masked - if (!feature_mask) { - return; - } else { - feature_mask = feature_mask - 1; // feature_mask is used for get feature (1: 4bit feature, 0: 8bit feature) - } - - // STAGE 1: read feature data, and gradient and hessian - // first half of the threads read feature data from global memory - // We will prefetch data into the "next" variable at the beginning of each iteration - uchar feature; - uchar feature_next; - uint16_t bin; - - feature = feature_data[ind >> feature_mask]; - if (feature_mask) { - feature = (feature >> ((ind & 1) << 2)) & 0xf; - } - bin = feature; - acc_type grad_bin = 0.0f, hess_bin = 0.0f; - acc_type *addr_bin; - - // store gradient and hessian - score_t grad, hess; - score_t grad_next, hess_next; - grad = ordered_gradients[ind]; - #if CONST_HESSIAN == 0 - hess = ordered_hessians[ind]; - #endif - - // there are 2^POWER_FEATURE_WORKGROUPS workgroups processing each feature4 - for (unsigned int i = subglobal_tid; i < num_data; i += subglobal_size) { - // prefetch the next iteration variables - // we don't need boundary check because we have made the buffer large - int i_next = i + subglobal_size; - #ifdef IGNORE_INDICES - // we need to check to bounds here - ind_next = i_next < num_data ? i_next : i; - #else - ind_next = data_indices[i_next]; - #endif - - grad_next = ordered_gradients[ind_next]; - #if CONST_HESSIAN == 0 - hess_next = ordered_hessians[ind_next]; - #endif - // STAGE 2: accumulate gradient and hessian - if (bin != feature) { - addr_bin = gh_hist + bin * 2 + is_hessian_first; - #if CONST_HESSIAN == 0 - acc_type acc_bin = is_hessian_first ? hess_bin : grad_bin; - atomic_local_add_f(addr_bin, acc_bin); - - addr_bin = addr_bin + 1 - 2 * is_hessian_first; - acc_bin = is_hessian_first ? grad_bin : hess_bin; - atomic_local_add_f(addr_bin, acc_bin); - - #elif CONST_HESSIAN == 1 - atomic_local_add_f(addr_bin, grad_bin); - #endif - - bin = feature; - grad_bin = grad; - hess_bin = hess; - } else { - grad_bin += grad; - hess_bin += hess; - } - - // prefetch the next iteration variables - feature_next = feature_data[ind_next >> feature_mask]; - - // STAGE 3: accumulate counter - atomicAdd(cnt_hist + feature, 1); - - // STAGE 4: update next stat - grad = grad_next; - hess = hess_next; - if (!feature_mask) { - feature = feature_next; - } else { - feature = (feature_next >> ((ind_next & 1) << 2)) & 0xf; - } - } - - addr_bin = gh_hist + bin * 2 + is_hessian_first; - #if CONST_HESSIAN == 0 - acc_type acc_bin = is_hessian_first ? hess_bin : grad_bin; - atomic_local_add_f(addr_bin, acc_bin); - - addr_bin = addr_bin + 1 - 2 * is_hessian_first; - acc_bin = is_hessian_first ? 
grad_bin : hess_bin; - - atomic_local_add_f(addr_bin, acc_bin); - - #elif CONST_HESSIAN == 1 - atomic_local_add_f(addr_bin, grad_bin); - #endif - __syncthreads(); - - #if CONST_HESSIAN == 1 - // make a final reduction - gh_hist[ltid * 2] += gh_hist[ltid * 2 + 1]; - gh_hist[ltid * 2 + 1] = const_hessian * cnt_hist[ltid]; // counter move to this position - __syncthreads(); - #endif - -#if POWER_FEATURE_WORKGROUPS != 0 - acc_type *__restrict__ output = reinterpret_cast(output_buf) + group_id * 3 * NUM_BINS; - // write gradients and Hessians - acc_type *__restrict__ ptr_f = output; - for (uint16_t i = ltid; i < 2 * NUM_BINS; i += lsize) { - // even threads read gradients, odd threads read Hessians - acc_type value = gh_hist[i]; - ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value; - } - // write counts - acc_int_type *__restrict__ ptr_i = reinterpret_cast(output + 2 * NUM_BINS); - for (uint16_t i = ltid; i < NUM_BINS; i += lsize) { - unsigned int value = cnt_hist[i]; - ptr_i[i] = value; - } - __syncthreads(); - __threadfence(); - unsigned int * counter_val = cnt_hist; - // backup the old value - unsigned int old_val = *counter_val; - if (ltid == 0) { - // all workgroups processing the same feature add this counter - *counter_val = atomicAdd(const_cast(sync_counters + feature_id), 1); - } - // make sure everyone in this workgroup is here - __syncthreads(); - // everyone in this workgroup: if we are the last workgroup, then do reduction! - if (*counter_val == (1 << power_feature_workgroups) - 1) { - if (ltid == 0) { - sync_counters[feature_id] = 0; - } -#else - } - // only 1 work group, no need to increase counter - // the reduction will become a simple copy - { - unsigned int old_val; // dummy -#endif - // locate our feature's block in output memory - unsigned int output_offset = (feature_id << power_feature_workgroups); - acc_type const * __restrict__ feature_subhists = - reinterpret_cast(output_buf) + output_offset * 3 * NUM_BINS; - // skip reading the data already in local memory - unsigned int skip_id = group_id - output_offset; - // locate output histogram location for this feature4 - acc_type *__restrict__ hist_buf = hist_buf_base + feature_id * 2 * NUM_BINS; - - within_kernel_reduction256x4(feature_subhists, skip_id, old_val, 1 << power_feature_workgroups, hist_buf, reinterpret_cast(shared_array), power_feature_workgroups); - } -} - -// end of histogram256 stuff - -} // namespace LightGBM diff --git a/src/treelearner/kernels/histogram_16_64_256.hu b/src/treelearner/kernels/histogram_16_64_256.hu deleted file mode 100644 index f72b7465783d..000000000000 --- a/src/treelearner/kernels/histogram_16_64_256.hu +++ /dev/null @@ -1,160 +0,0 @@ -/*! - * Copyright (c) 2020 IBM Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for license information. 
- */ - -#ifndef LIGHTGBM_TREELEARNER_KERNELS_HISTOGRAM_16_64_256_HU_ -#define LIGHTGBM_TREELEARNER_KERNELS_HISTOGRAM_16_64_256_HU_ - -#include "LightGBM/meta.h" - -namespace LightGBM { - -// use double precision or not -#ifndef USE_DP_FLOAT -#define USE_DP_FLOAT 1 -#endif - -// ignore hessian, and use the local memory for hessian as an additional bank for gradient -#ifndef CONST_HESSIAN -#define CONST_HESSIAN 0 -#endif - -typedef unsigned char uchar; - -template -__device__ double as_double(const T t) { - static_assert(sizeof(T) == sizeof(double), "size mismatch"); - double d; - memcpy(&d, &t, sizeof(T)); - return d; -} -template -__device__ unsigned long long as_ulong_ulong(const T t) { - static_assert(sizeof(T) == sizeof(unsigned long long), "size mismatch"); - unsigned long long u; - memcpy(&u, &t, sizeof(T)); - return u; -} -template -__device__ float as_float(const T t) { - static_assert(sizeof(T) == sizeof(float), "size mismatch"); - float f; - memcpy(&f, &t, sizeof(T)); - return f; -} -template -__device__ unsigned int as_uint(const T t) { - static_assert(sizeof(T) == sizeof(unsigned int), "size_mismatch"); - unsigned int u; - memcpy(&u, &t, sizeof(T)); - return u; -} -template -__device__ uchar4 as_uchar4(const T t) { - static_assert(sizeof(T) == sizeof(uchar4), "size mismatch"); - uchar4 u; - memcpy(&u, &t, sizeof(T)); - return u; -} - -#if USE_DP_FLOAT == 1 -typedef double acc_type; -typedef unsigned long long acc_int_type; -#define as_acc_type as_double -#define as_acc_int_type as_ulong_ulong -#else -typedef float acc_type; -typedef unsigned int acc_int_type; -#define as_acc_type as_float -#define as_acc_int_type as_uint -#endif - -// use all features and do not use feature mask -#ifndef ENABLE_ALL_FEATURES -#define ENABLE_ALL_FEATURES 1 -#endif - -// define all of the different kernels - -#define DECLARE_CONST_BUF(name) \ -__global__ void name(__global const uchar* restrict feature_data_base, \ - const uchar* restrict feature_masks,\ - const data_size_t feature_size,\ - const data_size_t* restrict data_indices, \ - const data_size_t num_data, \ - const score_t* restrict ordered_gradients, \ - const score_t* restrict ordered_hessians,\ - char* __restrict__ output_buf,\ - volatile int * sync_counters,\ - acc_type* __restrict__ hist_buf_base, \ - const size_t power_feature_workgroups); - - -#define DECLARE_CONST_HES_CONST_BUF(name) \ -__global__ void name(const uchar* __restrict__ feature_data_base, \ - const uchar* __restrict__ feature_masks,\ - const data_size_t feature_size,\ - const data_size_t* __restrict__ data_indices, \ - const data_size_t num_data, \ - const score_t* __restrict__ ordered_gradients, \ - const score_t const_hessian,\ - char* __restrict__ output_buf,\ - volatile int * sync_counters,\ - acc_type* __restrict__ hist_buf_base, \ - const size_t power_feature_workgroups); - - - -#define DECLARE_CONST_HES(name) \ -__global__ void name(const uchar* feature_data_base, \ - const uchar* __restrict__ feature_masks,\ - const data_size_t feature_size,\ - const data_size_t* data_indices, \ - const data_size_t num_data, \ - const score_t* ordered_gradients, \ - const score_t const_hessian,\ - char* __restrict__ output_buf, \ - volatile int * sync_counters,\ - acc_type* __restrict__ hist_buf_base, \ - const size_t power_feature_workgroups); - - -#define DECLARE(name) \ -__global__ void name(const uchar* feature_data_base, \ - const uchar* __restrict__ feature_masks,\ - const data_size_t feature_size,\ - const data_size_t* data_indices, \ - const data_size_t num_data, \ - 
const score_t* ordered_gradients, \ - const score_t* ordered_hessians,\ - char* __restrict__ output_buf, \ - volatile int * sync_counters,\ - acc_type* __restrict__ hist_buf_base, \ - const size_t power_feature_workgroups); - - -DECLARE_CONST_HES(histogram16_allfeats); -DECLARE_CONST_HES(histogram16_fulldata); -DECLARE_CONST_HES(histogram16); -DECLARE(histogram16_allfeats); -DECLARE(histogram16_fulldata); -DECLARE(histogram16); - -DECLARE_CONST_HES(histogram64_allfeats); -DECLARE_CONST_HES(histogram64_fulldata); -DECLARE_CONST_HES(histogram64); -DECLARE(histogram64_allfeats); -DECLARE(histogram64_fulldata); -DECLARE(histogram64); - -DECLARE_CONST_HES(histogram256_allfeats); -DECLARE_CONST_HES(histogram256_fulldata); -DECLARE_CONST_HES(histogram256); -DECLARE(histogram256_allfeats); -DECLARE(histogram256_fulldata); -DECLARE(histogram256); - -} // namespace LightGBM - -#endif // LIGHTGBM_TREELEARNER_KERNELS_HISTOGRAM_16_64_256_HU_
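
As a minimal, standalone illustration of the aggregation that patch 2 (PR #6761) introduces in lightgbm.engine._agg_cv_result(), the sketch below reduces per-fold (dataset_name, metric_name, value, is_higher_better) tuples to (dataset_name, metric_name, mean, is_higher_better, stdv) tuples keyed by (dataset, metric) pairs instead of by concatenated "dataset metric" strings. The fold results used here are invented purely for illustration and are not taken from the LightGBM test suite.

# Sketch of the aggregation performed by lightgbm.engine._agg_cv_result() after PR #6761.
# The fold results below are invented purely for illustration.
from collections import OrderedDict
from typing import Dict, List, Tuple

import numpy as np

# (dataset_name, metric_name, value, is_higher_better)
FoldResult = Tuple[str, str, float, bool]


def agg_cv_result(raw_results: List[List[FoldResult]]):
    # keyed by (dataset_name, metric_name) tuples rather than "dataset metric" strings
    metric_types: Dict[Tuple[str, str], bool] = OrderedDict()
    metric_values: Dict[Tuple[str, str], List[float]] = OrderedDict()
    for one_result in raw_results:
        for dataset_name, metric_name, metric_value, is_higher_better in one_result:
            key = (dataset_name, metric_name)
            metric_types[key] = is_higher_better
            metric_values.setdefault(key, []).append(metric_value)
    # one (dataset, metric, mean, is_higher_better, stdv) tuple per (dataset, metric) pair
    return [
        (k[0], k[1], float(np.mean(v)), metric_types[k], float(np.std(v)))
        for k, v in metric_values.items()
    ]


# two CV folds, one validation set, one metric
folds = [
    [("valid", "binary_logloss", 0.40, False)],
    [("valid", "binary_logloss", 0.44, False)],
]
print(agg_cv_result(folds))
# approximately: [('valid', 'binary_logloss', 0.42, False, 0.02)]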
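
Similarly, the output-shape behaviour that patch 3 (PR #6753) makes consistent can be seen with a quick sketch on synthetic data (the array sizes and random inputs here are assumptions for illustration, not fixtures from the repository's tests): with a single boosting round, predict(..., pred_leaf=True) and predict(..., pred_contrib=True) now return 2-dimensional arrays, just as they do for models with more than one tree.

# Sketch of the predict() shape consistency fixed by PR #6753; the data is synthetic.
import lightgbm as lgb
import numpy as np

rng = np.random.default_rng(42)
X = rng.normal(size=(100, 4))
y = rng.normal(size=100)

bst = lgb.train(
    {"objective": "regression", "verbosity": -1},
    lgb.Dataset(X, label=y),
    num_boost_round=1,
)

print(bst.predict(X).shape)                     # (100,)
print(bst.predict(X, pred_leaf=True).shape)     # (100, 1)  -- 2-D even with a single tree
print(bst.predict(X, pred_contrib=True).shape)  # (100, 5)  -- n_features + 1 columns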