From 35b0ca147b011b9453130c55b42aad197bc0bc88 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Thu, 9 Nov 2023 09:25:22 +0000 Subject: [PATCH] add nccl topology --- docs/Parameters.rst | 24 +- include/LightGBM/boosting.h | 4 +- include/LightGBM/config.h | 18 +- include/LightGBM/cuda/cuda_nccl_topology.hpp | 225 ++++++++++ .../LightGBM/cuda/cuda_objective_function.hpp | 2 +- include/LightGBM/cuda/cuda_utils.hu | 37 ++ include/LightGBM/dataset.h | 4 + include/LightGBM/objective_function.h | 8 + src/application/application.cpp | 6 +- src/boosting/boosting.cpp | 16 +- src/boosting/cuda/nccl_gbdt.cpp | 383 ++++-------------- src/boosting/cuda/nccl_gbdt.hpp | 39 +- src/boosting/cuda/nccl_gbdt_component.hpp | 95 +++++ src/boosting/gbdt.h | 2 +- src/c_api.cpp | 4 +- src/io/config_auto.cpp | 19 +- src/io/dataset.cpp | 35 +- src/objective/objective_function.cpp | 89 ++-- .../cuda/cuda_single_gpu_tree_learner.hpp | 2 +- src/treelearner/tree_learner.cpp | 2 +- 20 files changed, 604 insertions(+), 410 deletions(-) create mode 100644 include/LightGBM/cuda/cuda_nccl_topology.hpp create mode 100644 src/boosting/cuda/nccl_gbdt_component.hpp diff --git a/docs/Parameters.rst b/docs/Parameters.rst index 86104ba5be55..4d6e92e03988 100644 --- a/docs/Parameters.rst +++ b/docs/Parameters.rst @@ -1280,7 +1280,7 @@ GPU Parameters - ``gpu_platform_id`` :raw-html:`🔗︎`, default = ``-1``, type = int - - OpenCL platform ID. Usually each GPU vendor exposes one OpenCL platform + - OpenCL platform ID with device_type=gpu. Usually each GPU vendor exposes one OpenCL platform - ``-1`` means the system-wide default platform @@ -1288,23 +1288,31 @@ GPU Parameters - ``gpu_device_id`` :raw-html:`🔗︎`, default = ``-1``, type = int - - OpenCL device ID in the specified platform. Each GPU in the selected platform has a unique device ID + - Master CUDA device ID with device_type=cuda or OpenCL device ID in the specified platform with device_type=gpu. + + - Each GPU in the selected platform has a unique device ID - ``-1`` means the default device in the selected platform - **Note**: refer to `GPU Targets <./GPU-Targets.rst#query-opencl-devices-in-your-system>`__ for more details -- ``gpu_use_dp`` :raw-html:`🔗︎`, default = ``false``, type = bool +- ``num_gpus`` :raw-html:`🔗︎`, default = ``1``, type = int - - set this to ``true`` to use double precision math on GPU (by default single precision is used) + - Number of GPUs to use for training, used with device_type=cuda - - **Note**: can be used only in OpenCL implementation, in CUDA implementation only double precision is currently supported + - When <= 0, only 1 GPU will be used + +- ``gpu_device_id_list`` :raw-html:`🔗︎`, default = ``""``, type = string -- ``num_gpu`` :raw-html:`🔗︎`, default = ``1``, type = int, constraints: ``num_gpu > 0`` + - List of CUDA device IDs used when device_type=cuda - - number of GPUs + - When empty, the devices with the smallest IDs will be used - - **Note**: can be used only in CUDA implementation +- ``gpu_use_dp`` :raw-html:`🔗︎`, default = ``false``, type = bool + + - set this to ``true`` to use double precision math on GPU (by default single precision is used) + + - **Note**: can be used only in OpenCL implementation, in CUDA implementation only double precision is currently supported .. 
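For clarity, a minimal CUDA multi-GPU configuration using the new parameters might look like the following (illustrative only; the exact device-selection and master-device fallback behavior is defined by the ``NCCLTopology`` constructor added in this patch)::

    device_type = cuda
    num_gpus = 4
    gpu_device_id_list = 0,1,2,3
    gpu_device_id = 0
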
end params list diff --git a/include/LightGBM/boosting.h b/include/LightGBM/boosting.h index 1bfc18b4470b..8e694aca29da 100644 --- a/include/LightGBM/boosting.h +++ b/include/LightGBM/boosting.h @@ -309,9 +309,11 @@ class LIGHTGBM_EXPORT Boosting { * \param format Format of model * \param config config for boosting * \param filename name of model file, if existing will continue to train from this model + * \param device_type type of device, can be cpu, gpu or cuda + * \param num_gpu number of GPUs to use * \return The boosting object */ - static Boosting* CreateBoosting(const std::string& type, const char* filename); + static Boosting* CreateBoosting(const std::string& type, const char* filename, const std::string& device_type, const int num_gpu); virtual std::string GetLoadedParam() const = 0; diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index 6d61bc764924..09a6836f56c7 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -1091,25 +1091,29 @@ struct Config { #pragma region GPU Parameters #endif // __NVCC__ - // desc = OpenCL platform ID. Usually each GPU vendor exposes one OpenCL platform + // desc = OpenCL platform ID with device_type=gpu. Usually each GPU vendor exposes one OpenCL platform // desc = ``-1`` means the system-wide default platform // desc = **Note**: refer to `GPU Targets <./GPU-Targets.rst#query-opencl-devices-in-your-system>`__ for more details int gpu_platform_id = -1; - // desc = OpenCL device ID in the specified platform. Each GPU in the selected platform has a unique device ID + // desc = Master CUDA device ID with device_type=cuda or OpenCL device ID in the specified platform with device_type=gpu. + // desc = Each GPU in the selected platform has a unique device ID // desc = ``-1`` means the default device in the selected platform // desc = **Note**: refer to `GPU Targets <./GPU-Targets.rst#query-opencl-devices-in-your-system>`__ for more details int gpu_device_id = -1; + // desc = Number of GPUs to use for training, used with device_type=cuda + // desc = When <= 0, only 1 GPU will be used + int num_gpus = 1; + + // desc = List of CUDA device IDs used when device_type=cuda + // desc = When empty, the devices with the smallest IDs will be used + std::string gpu_device_id_list = ""; + // desc = set this to ``true`` to use double precision math on GPU (by default single precision is used) // desc = **Note**: can be used only in OpenCL implementation, in CUDA implementation only double precision is currently supported bool gpu_use_dp = false; - // check = >0 - // desc = number of GPUs - // desc = **Note**: can be used only in CUDA implementation - int num_gpu = 1; - #ifndef __NVCC__ #pragma endregion diff --git a/include/LightGBM/cuda/cuda_nccl_topology.hpp b/include/LightGBM/cuda/cuda_nccl_topology.hpp new file mode 100644 index 000000000000..98f485ba97d8 --- /dev/null +++ b/include/LightGBM/cuda/cuda_nccl_topology.hpp @@ -0,0 +1,225 @@ +/*! + * Copyright (c) 2023 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. 
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace LightGBM {
+
+class NCCLTopology {
+ public:
+  NCCLTopology(const int master_gpu_device_id, const int num_gpus, const std::string& gpu_device_id_list, const data_size_t global_num_data) {
+    num_gpus_ = num_gpus;
+    master_gpu_device_id_ = master_gpu_device_id;
+    global_num_data_ = global_num_data;
+    int max_num_gpu = 0;
+    CUDASUCCESS_OR_FATAL(cudaGetDeviceCount(&max_num_gpu));
+    if (gpu_device_id_list != std::string("")) {
+      std::set<int> gpu_id_set;
+      std::vector<std::string> gpu_list_str = Common::Split(gpu_device_id_list.c_str(), ",");
+      for (const auto& gpu_str : gpu_list_str) {
+        int gpu_id = 0;
+        Common::Atoi(gpu_str.c_str(), &gpu_id);
+        if (gpu_id < 0 || gpu_id >= max_num_gpu) {
+          Log::Warning("Invalid GPU device ID %d in gpu_device_id_list is ignored.", gpu_id);
+        } else {
+          gpu_id_set.insert(gpu_id);
+        }
+      }
+      for (const int gpu_id : gpu_id_set) {
+        gpu_list_.push_back(gpu_id);
+      }
+    }
+    if (!gpu_list_.empty() && num_gpus_ != static_cast<int>(gpu_list_.size())) {
+      Log::Warning("num_gpus = %d is different from the number of valid device IDs in gpu_device_id_list (%d), using %d GPUs instead.",
+        num_gpus_, static_cast<int>(gpu_list_.size()), static_cast<int>(gpu_list_.size()));
+      num_gpus_ = static_cast<int>(gpu_list_.size());
+    }
+
+    if (!gpu_list_.empty()) {
+      bool check_master_gpu = false;
+      for (int i = 0; i < static_cast<int>(gpu_list_.size()); ++i) {
+        const int gpu_id = gpu_list_[i];
+        if (gpu_id == master_gpu_device_id_) {
+          check_master_gpu = true;
+          master_gpu_index_ = i;
+          break;
+        }
+      }
+      if (!check_master_gpu) {
+        Log::Warning("Master GPU device ID is not in gpu_device_id_list. Using %d as the master GPU instead.", gpu_list_[0]);
+        master_gpu_device_id_ = gpu_list_[0];
+        master_gpu_index_ = 0;
+      }
+    } else {
+      if (num_gpus_ <= 0) {
+        num_gpus_ = 1;
+      } else if (num_gpus_ > max_num_gpu) {
+        Log::Warning("Only %d GPUs are available, using num_gpus = %d.", max_num_gpu, max_num_gpu);
+        num_gpus_ = max_num_gpu;
+      }
+      if (master_gpu_device_id_ < 0 || master_gpu_device_id_ >= num_gpus_) {
+        Log::Warning("Invalid gpu_device_id = %d for the master GPU, using gpu_device_id = 0 instead.", master_gpu_device_id_);
+        master_gpu_device_id_ = 0;
+        master_gpu_index_ = 0;
+      }
+      for (int i = 0; i < num_gpus_; ++i) {
+        gpu_list_.push_back(i);
+      }
+    }
+
+    Log::Info("Using GPU devices %s, and local master GPU device %d.", Common::Join(gpu_list_, ",").c_str(), master_gpu_device_id_);
+
+    const int num_threads = OMP_NUM_THREADS();
+    if (num_gpus_ > num_threads) {
+      Log::Fatal("Number of GPUs %d is greater than the number of threads %d. 
Please use more threads.", num_gpus_, num_threads); + } + + host_threads_.resize(num_gpus_); + } + + void InitNCCL() { + nccl_gpu_rank_.resize(num_gpus_, -1); + nccl_communicators_.resize(num_gpus_); + ncclUniqueId nccl_unique_id; + if (Network::num_machines() == 1 || Network::rank() == 0) { + NCCLCHECK(ncclGetUniqueId(&nccl_unique_id)); + } + if (Network::num_machines() > 1) { + std::vector output_buffer(Network::num_machines()); + Network::Allgather( + reinterpret_cast(&nccl_unique_id), + sizeof(ncclUniqueId) / sizeof(char), + reinterpret_cast(output_buffer.data())); + if (Network::rank() > 0) { + nccl_unique_id = output_buffer[0]; + } + } + + if (Network::num_machines() > 1) { + node_rank_offset_.resize(Network::num_machines() + 1, 0); + Network::Allgather( + reinterpret_cast(&num_gpus_), + sizeof(int) / sizeof(char), + reinterpret_cast(node_rank_offset_.data() + 1)); + for (int rank = 1; rank < Network::num_machines() + 1; ++rank) { + node_rank_offset_[rank] += node_rank_offset_[rank - 1]; + } + CHECK_EQ(node_rank_offset_[Network::rank() + 1] - node_rank_offset_[Network::rank()], num_gpus_); + NCCLCHECK(ncclGroupStart()); + for (int gpu_index = 0; gpu_index < num_gpus_; ++gpu_index) { + SetCUDADevice(gpu_list_[gpu_index], __FILE__, __LINE__); + nccl_gpu_rank_[gpu_index] = gpu_index + node_rank_offset_[Network::rank()]; + NCCLCHECK(ncclCommInitRank(&nccl_communicators_[gpu_index], node_rank_offset_.back(), nccl_unique_id, nccl_gpu_rank_[gpu_index])); + } + NCCLCHECK(ncclGroupEnd()); + } else { + NCCLCHECK(ncclGroupStart()); + for (int gpu_index = 0; gpu_index < num_gpus_; ++gpu_index) { + SetCUDADevice(gpu_list_[gpu_index], __FILE__, __LINE__); + nccl_gpu_rank_[gpu_index] = gpu_index; + NCCLCHECK(ncclCommInitRank(&nccl_communicators_[gpu_index], num_gpus_, nccl_unique_id, gpu_index)); + } + NCCLCHECK(ncclGroupEnd()); + } + + // return to master gpu device + CUDASUCCESS_OR_FATAL(cudaSetDevice(master_gpu_device_id_)); + } + + template + void RunPerDevice(const std::vector>& objs, const std::function& func) { + #pragma omp parallel for schedule(static) num_threads(num_gpus_) + for (int i = 0; i < num_gpus_; ++i) { + CUDASUCCESS_OR_FATAL(cudaSetDevice(gpu_list_[i])); + func(objs[i].get()); + } + CUDASUCCESS_OR_FATAL(cudaSetDevice(master_gpu_device_id_)); + } + + template + void InitPerDevice(std::vector>& vec) { + vec.resize(num_gpus_); + #pragma omp parallel for schedule(static) num_threads(num_gpus_) + for (int i = 0; i < num_gpus_; ++i) { + CUDASUCCESS_OR_FATAL(cudaSetDevice(gpu_list_[i])); + RET_T* nccl_info = new RET_T(); + nccl_info->SetNCCLInfo(nccl_communicators_[i], nccl_gpu_rank_[i], i, gpu_list_[i], global_num_data_); + } + CUDASUCCESS_OR_FATAL(cudaSetDevice(master_gpu_device_id_)); + } + + template + void DispatchPerDevice(std::vector>& objs, const std::function& func) { + for (int i = 0; i < num_gpus_; ++i) { + host_threads_[i] = std::thread([this, i, &func, &objs] () { + CUDASUCCESS_OR_FATAL(cudaSetDevice(gpu_list_[i])) + func(objs[i].get()); + }); + // if (pthread_create(&host_threads_[i], nullptr, [this, i, &func] (void* ptr) { + // CUDASUCCESS_OR_FATAL(cudaSetDevice(gpu_list_[i])) + // func(reinterpret_cast(ptr)); + // return reinterpret_cast(nullptr); + // }, + // reinterpret_cast(objs[i].get()))) { + // Log::Fatal("Error in creating boosting threads."); + // } + } + for (int i = 0; i < num_gpus_; ++i) { + // if (pthread_join(host_threads_[i], nullptr)) { + // Log::Fatal("Error in joining boosting threads."); + // } + host_threads_[i].join(); + } + 
CUDASUCCESS_OR_FATAL(cudaSetDevice(master_gpu_device_id_)); + } + + template + void RunOnMasterDevice(const std::vector>& objs, const std::function& func) { + CUDASUCCESS_OR_FATAL(cudaSetDevice(master_gpu_device_id_)); + func(objs[master_gpu_index_].get()); + } + + template + void RunOnNonMasterDevice(const std::vector>& objs, const std::function& func) { + for (int i = 0; i < num_gpus_; ++i) { + if (i != master_gpu_index_) { + CUDASUCCESS_OR_FATAL(cudaSetDevice(gpu_list_[i])); + func(objs[i].get()); + } + } + CUDASUCCESS_OR_FATAL(cudaSetDevice(master_gpu_device_id_)); + } + + int num_gpus() const { return num_gpus_; } + + int master_gpu_index() const { return master_gpu_index_; } + + int master_gpu_device_id() const { return master_gpu_device_id_; } + + const std::vector& gpu_list() const { return gpu_list_; } + + private: + int num_gpus_; + int master_gpu_index_; + int master_gpu_device_id_; + std::vector gpu_list_; + data_size_t global_num_data_; + + ncclUniqueId nccl_unique_id_; + std::vector node_rank_offset_; + std::vector nccl_gpu_rank_; + std::vector nccl_communicators_; + std::vector host_threads_; +}; + +} // namespace LightGBM diff --git a/include/LightGBM/cuda/cuda_objective_function.hpp b/include/LightGBM/cuda/cuda_objective_function.hpp index 465ed334156c..3b4ce46a38be 100644 --- a/include/LightGBM/cuda/cuda_objective_function.hpp +++ b/include/LightGBM/cuda/cuda_objective_function.hpp @@ -19,7 +19,7 @@ namespace LightGBM { template -class CUDAObjectiveInterface: public HOST_OBJECTIVE { +class CUDAObjectiveInterface: public HOST_OBJECTIVE, NCCLInfo { public: explicit CUDAObjectiveInterface(const Config& config): HOST_OBJECTIVE(config) { const int gpu_device_id = config.gpu_device_id >= 0 ? config.gpu_device_id : 0; diff --git a/include/LightGBM/cuda/cuda_utils.hu b/include/LightGBM/cuda/cuda_utils.hu index 4bd84aeb264d..7fbb482d06a4 100644 --- a/include/LightGBM/cuda/cuda_utils.hu +++ b/include/LightGBM/cuda/cuda_utils.hu @@ -11,8 +11,10 @@ #include #include #include +#include #include +#include #include #include @@ -32,6 +34,15 @@ inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = #define CUDASUCCESS_OR_FATAL_OUTER(ans) { gpuAssert((ans), file, line); } +#define NCCLCHECK(cmd) do { \ + ncclResult_t r = cmd; \ + if (r!= ncclSuccess) { \ + printf("Failed, NCCL error %s:%d '%s'\n", \ + __FILE__,__LINE__,ncclGetErrorString(r)); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + void SetCUDADevice(int gpu_device_id, const char* file, int line); int GetCUDADevice(const char* file, int line); @@ -205,6 +216,32 @@ static __device__ T SafeLog(T x) { } } +class NCCLInfo { + public: + NCCLInfo() {} + + void SetNCCLInfo( + ncclComm_t nccl_communicator, + int nccl_gpu_rank, + int local_gpu_rank, + int gpu_device_id, + data_size_t global_num_data) { + nccl_communicator_ = nccl_communicator; + nccl_gpu_rank_ = nccl_gpu_rank; + local_gpu_rank_ = local_gpu_rank; + gpu_device_id_ = gpu_device_id; + global_num_data_ = global_num_data; + } + + protected: + ncclComm_t nccl_communicator_ = nullptr; + int nccl_gpu_rank_ = -1; + int local_gpu_rank_ = -1; + int gpu_device_id_ = -1; + int num_gpu_in_node_ = 0; + data_size_t global_num_data_ = 0; +}; + } // namespace LightGBM #endif // USE_CUDA diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index 56bc7b841dc3..9e41d35e73cd 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -647,6 +647,8 @@ class Dataset { void CopySubrow(const Dataset* fullset, const data_size_t* used_indices, 
data_size_t num_used_indices, bool need_meta_data); + void CopySubrowToDevice(const Dataset* fullset, const data_size_t* used_indices, data_size_t num_used_indices, bool need_meta_data, int gpu_device_id); + MultiValBin* GetMultiBinFromSparseFeatures(const std::vector& offsets) const; MultiValBin* GetMultiBinFromAllFeatures(const std::vector& offsets) const; @@ -991,6 +993,8 @@ class Dataset { void CreateCUDAColumnData(); + void CopySubrowHostPart(const Dataset* fullset, const data_size_t* used_indices, data_size_t num_used_indices, bool need_meta_data); + std::string data_filename_; /*! \brief Store used features */ std::vector> feature_groups_; diff --git a/include/LightGBM/objective_function.h b/include/LightGBM/objective_function.h index ad188dc39676..fb3dc3ef2977 100644 --- a/include/LightGBM/objective_function.h +++ b/include/LightGBM/objective_function.h @@ -107,6 +107,14 @@ class ObjectiveFunction { virtual bool NeedConvertOutputCUDA () const { return false; } + /*! + * \brief Create object of objective function on CUDA + * \param type Specific type of objective function + * \param config Config for objective function + */ + LIGHTGBM_EXPORT static ObjectiveFunction* CreateObjectiveFunctionCUDA(const std::string& type, + const Config& config); + #endif // USE_CUDA }; diff --git a/src/application/application.cpp b/src/application/application.cpp index 0bb9eca13bf2..926d1d4608d1 100644 --- a/src/application/application.cpp +++ b/src/application/application.cpp @@ -181,7 +181,7 @@ void Application::InitTrain() { // create boosting boosting_.reset( Boosting::CreateBoosting(config_.boosting, - config_.input_model.c_str())); + config_.input_model.c_str(), config_.device_type, config_.num_gpus)); // create objective function objective_fun_.reset( ObjectiveFunction::CreateObjectiveFunction(config_.objective, @@ -261,13 +261,13 @@ void Application::Predict() { void Application::InitPredict() { boosting_.reset( - Boosting::CreateBoosting("gbdt", config_.input_model.c_str())); + Boosting::CreateBoosting("gbdt", config_.input_model.c_str(), config_.device_type, config_.num_gpus)); Log::Info("Finished initializing prediction, total used %d iterations", boosting_->GetCurrentIteration()); } void Application::ConvertModel() { boosting_.reset( - Boosting::CreateBoosting(config_.boosting, config_.input_model.c_str())); + Boosting::CreateBoosting(config_.boosting, config_.input_model.c_str(), config_.device_type, config_.num_gpus)); boosting_->SaveModelToIfElse(-1, config_.convert_model.c_str()); } diff --git a/src/boosting/boosting.cpp b/src/boosting/boosting.cpp index 98f2554b1388..7c5fcde1b655 100644 --- a/src/boosting/boosting.cpp +++ b/src/boosting/boosting.cpp @@ -8,6 +8,10 @@ #include "gbdt.h" #include "rf.hpp" +#ifdef USE_CUDA +#include "cuda/nccl_gbdt.hpp" +#endif // USE_CUDA + namespace LightGBM { std::string GetBoostingTypeFromModelFile(const char* filename) { @@ -31,7 +35,7 @@ bool Boosting::LoadFileToBoosting(Boosting* boosting, const char* filename) { return true; } -Boosting* Boosting::CreateBoosting(const std::string& type, const char* filename) { +Boosting* Boosting::CreateBoosting(const std::string& type, const char* filename, const std::string& device_type, const int num_gpus) { if (filename == nullptr || filename[0] == '\0') { if (type == std::string("gbdt")) { return new GBDT(); @@ -48,7 +52,15 @@ Boosting* Boosting::CreateBoosting(const std::string& type, const char* filename std::unique_ptr ret; if (GetBoostingTypeFromModelFile(filename) == std::string("tree")) { if (type == 
std::string("gbdt")) { - ret.reset(new GBDT()); + #ifdef USE_CUDA + if (device_type == std::string("cuda") && num_gpus > 1) { + return new NCCLGBDT(); + } else { + #endif // USE_CUDA + return new GBDT(); + #ifdef USE_CUDA + } + #endif // USE_CUDA } else if (type == std::string("dart")) { ret.reset(new DART()); } else if (type == std::string("goss")) { diff --git a/src/boosting/cuda/nccl_gbdt.cpp b/src/boosting/cuda/nccl_gbdt.cpp index 47672a53679c..f06c5b92611b 100644 --- a/src/boosting/cuda/nccl_gbdt.cpp +++ b/src/boosting/cuda/nccl_gbdt.cpp @@ -4,8 +4,11 @@ */ #include "nccl_gbdt.hpp" +#include "nccl_gbdt_component.hpp" #include +#include + #ifdef USE_CUDA namespace LightGBM { @@ -22,182 +25,26 @@ void NCCLGBDT::Init( const ObjectiveFunction* objective_function, const std::vector& training_metrics) { GBDT_T::Init(gbdt_config, train_data, objective_function, training_metrics); - int max_num_gpu = 0; - CUDASUCCESS_OR_FATAL(cudaGetDeviceCount(&max_num_gpu)); - num_gpu_ = this->config_->num_gpu; - if (num_gpu_ > max_num_gpu) { - Log::Warning("Specifying %d GPUs, but only %d available.", num_gpu_, max_num_gpu); - num_gpu_ = max_num_gpu; - } - int gpu_device_id = this->config_->gpu_device_id; - if (this->config_->gpu_device_list == std::string("")) { - if (gpu_device_id < 0 || gpu_device_id >= num_gpu_) { - Log::Warning("Master GPU Device ID %d is not in the valid range [%d, %d], will use GPU 0 as master.", gpu_device_id, 0, max_num_gpu); - gpu_device_id = 0; - } - } - master_gpu_device_id_ = gpu_device_id; - master_gpu_index_ = master_gpu_device_id_; - - if (this->config_->gpu_device_list != std::string("")) { - std::vector gpu_list_str = Common::Split(this->config_->gpu_device_list.c_str(), ","); - for (const auto& gpu_str : gpu_list_str) { - int gpu_id = 0; - Common::Atoi(gpu_str.c_str(), &gpu_id); - gpu_list_.emplace_back(gpu_id); - } - bool check_master_gpu = false; - for (int i = 0; i < static_cast(gpu_list_.size()); ++i) { - const int gpu_id = gpu_list_[i]; - if (gpu_id == master_gpu_device_id_) { - master_gpu_index_ = i; - check_master_gpu = true; - } - } - if (!check_master_gpu) { - Log::Fatal("Master GPU ID %d is not in GPU ID list.", master_gpu_device_id_); - } - } - - const int num_threads = OMP_NUM_THREADS(); - if (num_gpu_ > num_threads) { - Log::Fatal("Number of GPUs %d is greather than the number of threads %d. 
Please use more threads.", num_gpu_, num_threads); - } - - InitNCCL(); - - // partition data across GPUs - const data_size_t num_data_per_gpu = (this->num_data_ + num_gpu_ - 1) / num_gpu_; - std::vector all_data_indices(this->num_data_, 0); - #pragma omp parallel for schedule(static) - for (data_size_t i = 0; i < this->num_data_; ++i) { - all_data_indices[i] = i; - } - per_gpu_data_start_.resize(num_gpu_); - per_gpu_data_end_.resize(num_gpu_); - per_gpu_datasets_.resize(num_gpu_); - for (int gpu_index = 0; gpu_index < num_gpu_; ++gpu_index) { - SetCUDADevice(gpu_index); - const data_size_t data_start = num_data_per_gpu * gpu_index; - const data_size_t data_end = std::min(data_start + num_data_per_gpu, this->num_data_); - const data_size_t num_data_in_gpu = data_end - data_start; - per_gpu_data_start_[gpu_index] = data_start; - per_gpu_data_end_[gpu_index] = data_end; - per_gpu_datasets_[gpu_index].reset(new Dataset(num_data_in_gpu)); - per_gpu_datasets_[gpu_index]->ReSize(num_data_in_gpu); - per_gpu_datasets_[gpu_index]->CopyFeatureMapperFrom(this->train_data_); - per_gpu_datasets_[gpu_index]->CopySubrow(this->train_data_, all_data_indices.data() + data_start, num_data_in_gpu, true, data_start, data_end, GetCUDADevice(gpu_index)); - } - // initialize per gpu objectives, training scores and tree learners - per_gpu_objective_functions_.resize(num_gpu_); - per_gpu_train_score_updater_.resize(num_gpu_); - per_gpu_gradients_.resize(num_gpu_); - per_gpu_hessians_.resize(num_gpu_); - per_gpu_tree_learners_.resize(num_gpu_); - for (int gpu_index = 0; gpu_index < num_gpu_; ++gpu_index) { - SetCUDADevice(gpu_index); - const data_size_t num_data_in_gpu = per_gpu_data_end_[gpu_index] - per_gpu_data_start_[gpu_index]; - per_gpu_objective_functions_[gpu_index].reset(ObjectiveFunction::CreateObjectiveFunction(this->config_->objective, *(this->config_.get()))); - per_gpu_objective_functions_[gpu_index]->Init(per_gpu_datasets_[gpu_index]->metadata(), per_gpu_datasets_[gpu_index]->num_data()); - per_gpu_objective_functions_[gpu_index]->SetNCCLComm(&nccl_communicators_[gpu_index]); - per_gpu_train_score_updater_[gpu_index].reset(new CUDAScoreUpdater(per_gpu_datasets_[gpu_index].get(), this->num_tree_per_iteration_)); - per_gpu_gradients_[gpu_index].reset(new CUDAVector(num_data_in_gpu)); - per_gpu_hessians_[gpu_index].reset(new CUDAVector(num_data_in_gpu)); - per_gpu_tree_learners_[gpu_index].reset(TreeLearner::CreateTreeLearner( - this->config_->tree_learner, - this->config_->device_type, - this->config_.get())); - per_gpu_tree_learners_[gpu_index]->SetNCCL(&nccl_communicators_[gpu_index], nccl_gpu_rank_[gpu_index], GetCUDADevice(gpu_index), this->num_data_); - per_gpu_tree_learners_[gpu_index]->Init(per_gpu_datasets_[gpu_index].get(), this->is_constant_hessian_); - } - - // initialize host threads and thread data - host_threads_.resize(num_gpu_); - boosting_thread_data_.resize(num_gpu_); - train_tree_learner_thread_data_.resize(num_gpu_); - update_score_thread_data_.resize(num_gpu_); - for (int gpu_index = 0; gpu_index < num_gpu_; ++gpu_index) { - boosting_thread_data_[gpu_index].gpu_index = GetCUDADevice(gpu_index); - boosting_thread_data_[gpu_index].gpu_objective_function = per_gpu_objective_functions_[gpu_index].get(); - boosting_thread_data_[gpu_index].gradients = per_gpu_gradients_[gpu_index]->RawData(); - boosting_thread_data_[gpu_index].hessians = per_gpu_hessians_[gpu_index]->RawData(); - boosting_thread_data_[gpu_index].score = per_gpu_train_score_updater_[gpu_index]->score(); - 
train_tree_learner_thread_data_[gpu_index].gpu_index = GetCUDADevice(gpu_index); - train_tree_learner_thread_data_[gpu_index].gpu_tree_learner = per_gpu_tree_learners_[gpu_index].get(); - train_tree_learner_thread_data_[gpu_index].gradients = per_gpu_gradients_[gpu_index]->RawData(); - train_tree_learner_thread_data_[gpu_index].hessians = per_gpu_hessians_[gpu_index]->RawData(); - train_tree_learner_thread_data_[gpu_index].num_data_in_gpu = per_gpu_data_end_[gpu_index] - per_gpu_data_start_[gpu_index]; - update_score_thread_data_[gpu_index].gpu_index = GetCUDADevice(gpu_index); - update_score_thread_data_[gpu_index].gpu_score_updater = per_gpu_train_score_updater_[gpu_index].get(); - update_score_thread_data_[gpu_index].gpu_tree_learner = per_gpu_tree_learners_[gpu_index].get(); - } - - // return to master gpu device - CUDASUCCESS_OR_FATAL(cudaSetDevice(master_gpu_device_id_)); -} + nccl_topology_.reset(new NCCLTopology(this->config_->gpu_device_id, this->config_->num_gpus, this->config_->gpu_device_id_list, train_data->num_data())); -template -void NCCLGBDT::InitNCCL() { - nccl_gpu_rank_.resize(num_gpu_, -1); - nccl_communicators_.resize(num_gpu_); - ncclUniqueId nccl_unique_id; - if (Network::num_machines() == 1 || Network::rank() == 0) { - NCCLCHECK(ncclGetUniqueId(&nccl_unique_id)); - } - if (Network::num_machines() > 1) { - std::vector output_buffer(Network::num_machines()); - Network::Allgather( - reinterpret_cast(&nccl_unique_id), - sizeof(ncclUniqueId) / sizeof(char), - reinterpret_cast(output_buffer.data())); - if (Network::rank() > 0) { - nccl_unique_id = output_buffer[0]; - } - } - - if (Network::num_machines() > 1) { - std::vector num_gpus_per_machine(Network::num_machines() + 1, 0); - Network::Allgather( - reinterpret_cast(&num_gpu_), - sizeof(int) / sizeof(char), - reinterpret_cast(num_gpus_per_machine.data() + 1)); - for (int rank = 1; rank < Network::num_machines() + 1; ++rank) { - num_gpus_per_machine[rank] += num_gpus_per_machine[rank - 1]; - } - CHECK_EQ(num_gpus_per_machine[Network::rank() + 1] - num_gpus_per_machine[Network::rank()], num_gpu_); - NCCLCHECK(ncclGroupStart()); - for (int gpu_index = 0; gpu_index < num_gpu_; ++gpu_index) { - SetCUDADevice(gpu_index); - nccl_gpu_rank_[gpu_index] = gpu_index + num_gpus_per_machine[Network::rank()]; - NCCLCHECK(ncclCommInitRank(&nccl_communicators_[gpu_index], num_gpus_per_machine.back(), nccl_unique_id, nccl_gpu_rank_[gpu_index])); - } - NCCLCHECK(ncclGroupEnd()); - } else { - NCCLCHECK(ncclGroupStart()); - for (int gpu_index = 0; gpu_index < num_gpu_; ++gpu_index) { - SetCUDADevice(gpu_index); - nccl_gpu_rank_[gpu_index] = gpu_index; - NCCLCHECK(ncclCommInitRank(&nccl_communicators_[gpu_index], num_gpu_, nccl_unique_id, gpu_index)); - } - NCCLCHECK(ncclGroupEnd()); - } + nccl_topology_->InitNCCL(); - // return to master gpu device - CUDASUCCESS_OR_FATAL(cudaSetDevice(master_gpu_device_id_)); + nccl_topology_->InitPerDevice(nccl_gbdt_components_); + nccl_topology_->RunPerDevice(nccl_gbdt_components_, [this, gbdt_config, train_data] + (NCCLGBDTComponent* nccl_gbdt_component) { nccl_gbdt_component->Init( + gbdt_config, train_data, this->num_tree_per_iteration_, this->boosting_on_gpu_, this->is_constant_hessian_ + ); + }); } template -void* NCCLGBDT::BoostingThread(void* thread_data) { - const BoostingThreadData* boosting_thread_data = reinterpret_cast(thread_data); - const int gpu_index = boosting_thread_data->gpu_index; - const ObjectiveFunction* objective_function = boosting_thread_data->gpu_objective_function; - score_t* 
gradients = boosting_thread_data->gradients; - score_t* hessians = boosting_thread_data->hessians; - const double* score = boosting_thread_data->score; - CUDASUCCESS_OR_FATAL(cudaSetDevice(gpu_index)); +void NCCLGBDT::BoostingThread(NCCLGBDTComponent* thread_data) { + const ObjectiveFunction* objective_function = thread_data->objective_function(); + score_t* gradients = thread_data->gradients(); + score_t* hessians = thread_data->hessians(); + const double* score = thread_data->train_score_updater()->score(); objective_function->GetGradients(score, gradients, hessians); - return nullptr; } template @@ -206,46 +53,28 @@ void NCCLGBDT::Boosting() { if (this->objective_function_ == nullptr) { Log::Fatal("No object function provided"); } - for (int gpu_index = 0; gpu_index < num_gpu_; ++gpu_index) { - if (pthread_create(&host_threads_[gpu_index], nullptr, BoostingThread, - reinterpret_cast(&boosting_thread_data_[gpu_index]))) { - Log::Fatal("Error in creating boosting threads."); - } - } - for (int gpu_index = 0; gpu_index < num_gpu_; ++gpu_index) { - if (pthread_join(host_threads_[gpu_index], nullptr)) { - Log::Fatal("Error in joining boosting threads."); - } - } + nccl_topology_->DispatchPerDevice(nccl_gbdt_components_, BoostingThread); } template double NCCLGBDT::BoostFromAverage(int class_id, bool update_scorer) { double init_score = GBDT_T::BoostFromAverage(class_id, update_scorer); - for (int gpu_index = 0; gpu_index < num_gpu_; ++gpu_index) { - SetCUDADevice(gpu_index); - if (std::fabs(init_score) > kEpsilon && update_scorer) { - per_gpu_train_score_updater_[gpu_index]->AddScore(init_score, class_id); - } + + if (init_score != 0.0) { + nccl_topology_->RunPerDevice(nccl_gbdt_components_, [init_score, class_id] (NCCLGBDTComponent* thread_data) { + thread_data->train_score_updater()->AddScore(init_score, class_id); + }); } - // return to master gpu device - CUDASUCCESS_OR_FATAL(cudaSetDevice(master_gpu_device_id_)); + return init_score; } template -void* NCCLGBDT::TrainTreeLearnerThread(void* thread_data) { - TrainTreeLearnerThreadData* tree_train_learner_thread_data = reinterpret_cast(thread_data); - const int gpu_index = tree_train_learner_thread_data->gpu_index; - const int class_id = tree_train_learner_thread_data->class_id; - const data_size_t num_data_in_gpu = tree_train_learner_thread_data->num_data_in_gpu; - const score_t* gradients = tree_train_learner_thread_data->gradients + class_id * num_data_in_gpu; - const score_t* hessians = tree_train_learner_thread_data->hessians + class_id * num_data_in_gpu; - const bool is_first_tree = tree_train_learner_thread_data->is_first_time; - CUDASUCCESS_OR_FATAL(cudaSetDevice(gpu_index)); - tree_train_learner_thread_data->tree.reset( - tree_train_learner_thread_data->gpu_tree_learner->Train(gradients, hessians, is_first_tree)); - return nullptr; +void NCCLGBDT::TrainTreeLearnerThread(NCCLGBDTComponent* thread_data, const int class_id, const bool is_first_tree) { + const data_size_t num_data_in_gpu = thread_data->num_data_in_gpu(); + const score_t* gradients = thread_data->gradients() + class_id * num_data_in_gpu; + const score_t* hessians = thread_data->hessians() + class_id * num_data_in_gpu; + thread_data->SetTree(thread_data->tree_learner()->Train(gradients, hessians, is_first_tree)); } template @@ -259,88 +88,61 @@ bool NCCLGBDT::TrainOneIter(const score_t* gradients, const score_t* hes } Boosting(); } else { - for (int gpu_index = 0; gpu_index < num_gpu_; ++gpu_index) { - SetCUDADevice(gpu_index); - const data_size_t gpu_data_start = 
per_gpu_data_start_[gpu_index]; - const data_size_t num_data_in_gpu = per_gpu_data_end_[gpu_index] - gpu_data_start; + nccl_topology_->RunPerDevice(nccl_gbdt_components_, [this, gradients, hessians] (NCCLGBDTComponent* thread_data) { + const data_size_t data_start_index = thread_data->data_start_index(); + const data_size_t num_data_in_gpu = thread_data->num_data_in_gpu(); + for (int class_id = 0; class_id < this->num_class_; ++class_id) { CopyFromHostToCUDADevice( - per_gpu_gradients_[gpu_index]->RawData() + class_id * num_data_in_gpu, - gradients + class_id * this->num_data_ + gpu_data_start, num_data_in_gpu, __FILE__, __LINE__); + thread_data->gradients() + class_id * num_data_in_gpu, + gradients + class_id * this->num_data_ + data_start_index, num_data_in_gpu, __FILE__, __LINE__); CopyFromHostToCUDADevice( - per_gpu_hessians_[gpu_index]->RawData() + class_id * num_data_in_gpu, - hessians + class_id * this->num_data_ + gpu_data_start, num_data_in_gpu, __FILE__, __LINE__); + thread_data->hessians() + class_id * num_data_in_gpu, + hessians + class_id * this->num_data_ + data_start_index, num_data_in_gpu, __FILE__, __LINE__); } - } - - // return to master gpu device - CUDASUCCESS_OR_FATAL(cudaSetDevice(master_gpu_device_id_)); + }); } bool should_continue = false; for (int cur_tree_id = 0; cur_tree_id < this->num_tree_per_iteration_; ++cur_tree_id) { - std::vector> new_tree(num_gpu_); - for (int gpu_index = 0; gpu_index < num_gpu_; ++gpu_index) { - new_tree[gpu_index].reset(nullptr); - } if (this->class_need_train_[cur_tree_id] && this->train_data_->num_features() > 0) { - if (this->is_use_subset_ && this->bag_data_cnt_ < this->num_data_) { + if (this->data_sample_strategy_->is_use_subset() && this->data_sample_strategy_->bag_data_cnt() < this->num_data_) { Log::Fatal("Bagging is not supported for NCCLGBDT"); } bool is_first_tree = this->models_.size() < static_cast(this->num_tree_per_iteration_); - for (int gpu_index = 0; gpu_index < num_gpu_; ++gpu_index) { - train_tree_learner_thread_data_[gpu_index].is_first_time = is_first_tree; - train_tree_learner_thread_data_[gpu_index].class_id = cur_tree_id; - if (pthread_create(&host_threads_[gpu_index], nullptr, TrainTreeLearnerThread, - reinterpret_cast(&train_tree_learner_thread_data_[gpu_index]))) { - Log::Fatal("Error in creating tree training threads."); - } - } - for (int gpu_index = 0; gpu_index < num_gpu_; ++gpu_index) { - if (pthread_join(host_threads_[gpu_index], nullptr)) { - Log::Fatal("Error in joining tree training threads."); - } - new_tree[gpu_index].reset(train_tree_learner_thread_data_[gpu_index].tree.release()); - } + nccl_topology_->DispatchPerDevice(nccl_gbdt_components_, + [is_first_tree, cur_tree_id] (NCCLGBDTComponent* thread_data) -> void { + TrainTreeLearnerThread(thread_data, cur_tree_id, is_first_tree); + }); } - if (new_tree[master_gpu_index_]->num_leaves() > 1) { - should_continue = true; - if (this->objective_function_ != nullptr && this->objective_function_->IsRenewTreeOutput()) { - Log::Fatal("Objective function with renewing is not supported for NCCLGBDT."); - } - // shrinkage by learning rate - for (int gpu_index = 0; gpu_index < num_gpu_; ++gpu_index) { - SetCUDADevice(gpu_index); - new_tree[gpu_index]->Shrinkage(this->shrinkage_rate_); + nccl_topology_->DispatchPerDevice(nccl_gbdt_components_, [cur_tree_id, this, init_scores] (NCCLGBDTComponent* thread_data) -> void { + this->UpdateScoreThread(thread_data, cur_tree_id, this->config_->learning_rate, init_scores[cur_tree_id]); + }); + + 
nccl_topology_->RunOnMasterDevice(nccl_gbdt_components_, [&should_continue, this, cur_tree_id] (NCCLGBDTComponent* thread_data) -> void { + if (thread_data->new_tree()->num_leaves() > 1) { + should_continue = true; } - CUDASUCCESS_OR_FATAL(cudaSetDevice(master_gpu_device_id_)); - // update score - UpdateScore(new_tree, cur_tree_id); - if (std::fabs(init_scores[cur_tree_id]) > kEpsilon) { - for (int gpu_index = 0; gpu_index < num_gpu_; ++gpu_index) { - SetCUDADevice(gpu_index); - new_tree[gpu_index]->AddBias(init_scores[cur_tree_id]); - } - CUDASUCCESS_OR_FATAL(cudaSetDevice(master_gpu_device_id_)); + for (auto& score_updater : this->valid_score_updater_) { + score_updater->AddScore(thread_data->new_tree(), cur_tree_id); } - } else { - // only add default score one-time + }); + + if (!should_continue) { if (this->models_.size() < static_cast(this->num_tree_per_iteration_)) { Log::Warning("Training stopped with no splits."); } } // add model - this->models_.push_back(std::move(new_tree[master_gpu_index_])); + nccl_topology_->RunOnMasterDevice(nccl_gbdt_components_, [this] (NCCLGBDTComponent* thread_data) -> void { + this->models_.emplace_back(thread_data->release_new_tree()); + }); - for (int gpu_index = 0; gpu_index < num_gpu_; ++gpu_index) { - if (gpu_index != master_gpu_index_) { - SetCUDADevice(gpu_index); - new_tree[gpu_index].reset(nullptr); - } - } - CUDASUCCESS_OR_FATAL(cudaSetDevice(master_gpu_device_id_)); + nccl_topology_->RunOnNonMasterDevice(nccl_gbdt_components_, [this] (NCCLGBDTComponent* thread_data) -> void { + thread_data->clear_new_tree(); + }); } if (!should_continue) { @@ -358,45 +160,20 @@ bool NCCLGBDT::TrainOneIter(const score_t* gradients, const score_t* hes } template -void* NCCLGBDT::UpdateScoreThread(void* thread_data) { - const UpdateScoreThreadData* update_score_thread_data = reinterpret_cast(thread_data); - const int gpu_index = update_score_thread_data->gpu_index; - CUDASUCCESS_OR_FATAL(cudaSetDevice(gpu_index)); - update_score_thread_data->gpu_score_updater->AddScore( - update_score_thread_data->gpu_tree_learner, - update_score_thread_data->tree, - update_score_thread_data->cur_tree_id); - return nullptr; -} - -template -void NCCLGBDT::UpdateScore(const std::vector>& tree, const int cur_tree_id) { - Common::FunctionTimer fun_timer("GBDT::UpdateScore", global_timer); - // update training score - if (!this->is_use_subset_) { - if (this->num_data_ - this->bag_data_cnt_ > 0) { - Log::Fatal("bagging is not supported for NCCLGBDT."); - } - for (int gpu_index = 0; gpu_index < num_gpu_; ++gpu_index) { - update_score_thread_data_[gpu_index].tree = tree[gpu_index].get(); - update_score_thread_data_[gpu_index].cur_tree_id = cur_tree_id; - if (pthread_create(&host_threads_[gpu_index], nullptr, UpdateScoreThread, - reinterpret_cast(&update_score_thread_data_[gpu_index]))) { - Log::Fatal("Error in creating update score threads."); - } +void NCCLGBDT::UpdateScoreThread(NCCLGBDTComponent* thread_data, const int cur_tree_id, const double shrinkage_rate, const double init_score) { + if (thread_data->new_tree()->num_leaves() > 1) { + // TODO(shiyu1994): implement bagging + if (thread_data->objective_function() != nullptr && thread_data->objective_function()->IsRenewTreeOutput()) { + // TODO(shiyu1994): implement renewing } - for (int gpu_index = 0; gpu_index < num_gpu_; ++gpu_index) { - if (pthread_join(host_threads_[gpu_index], nullptr)) { - Log::Fatal("Error in joining tree training threads."); - } + thread_data->new_tree()->Shrinkage(shrinkage_rate); + 
thread_data->train_score_updater()->AddScore( + thread_data->tree_learner(), + thread_data->new_tree(), + cur_tree_id); + if (std::fabs(init_score) > kEpsilon) { + thread_data->new_tree()->AddBias(init_score); } - } else { - Log::Fatal("bagging is not supported for NCCLGBDT."); - } - - // update validation score - for (auto& score_updater : this->valid_score_updater_) { - score_updater->AddScore(tree[master_gpu_index_].get(), cur_tree_id); } } @@ -405,18 +182,18 @@ std::vector NCCLGBDT::EvalOneMetric(const Metric* metric, const if (score == this->train_score_updater_->score()) { // delegate to per gpu train score updater std::vector tmp_score(num_data * this->num_class_, 0.0f); - for (int gpu_index = 0; gpu_index < num_gpu_; ++gpu_index) { - const data_size_t data_start = per_gpu_data_start_[gpu_index]; - const data_size_t num_data_in_gpu = per_gpu_data_end_[gpu_index] - data_start; + + nccl_topology_->RunPerDevice(nccl_gbdt_components_, [this, &tmp_score] (NCCLGBDTComponent* thread_data) { + const data_size_t data_start = thread_data->data_start_index(); + const data_size_t num_data_in_gpu = thread_data->num_data_in_gpu(); for (int class_id = 0; class_id < this->num_class_; ++class_id) { - SetCUDADevice(gpu_index); CopyFromCUDADeviceToHost(tmp_score.data() + class_id * this->num_data_ + data_start, - per_gpu_train_score_updater_[gpu_index]->score() + class_id * num_data_in_gpu, + thread_data->train_score_updater()->score() + class_id * num_data_in_gpu, static_cast(num_data_in_gpu), __FILE__, __LINE__); } - CUDASUCCESS_OR_FATAL(cudaSetDevice(master_gpu_device_id_)); - return metric->Eval(tmp_score.data(), this->objective_function_); - } + }); + + return metric->Eval(tmp_score.data(), this->objective_function_); } else { return GBDT_T::EvalOneMetric(metric, score, num_data); } diff --git a/src/boosting/cuda/nccl_gbdt.hpp b/src/boosting/cuda/nccl_gbdt.hpp index 1d808ef5e65f..7f648b1e42d6 100644 --- a/src/boosting/cuda/nccl_gbdt.hpp +++ b/src/boosting/cuda/nccl_gbdt.hpp @@ -9,10 +9,13 @@ #ifdef USE_CUDA #include "../gbdt.h" +#include #include #include #include "cuda_score_updater.hpp" #include +#include +#include "nccl_gbdt_component.hpp" namespace LightGBM { @@ -100,17 +103,11 @@ class NCCLGBDT: public GBDT_T { } }; - static void* BoostingThread(void* thread_data); + static void BoostingThread(NCCLGBDTComponent* thread_data); - static void* TrainTreeLearnerThread(void* thread_data); + static void TrainTreeLearnerThread(NCCLGBDTComponent* thread_data, const int class_id, const bool is_first_tree); - static void* UpdateScoreThread(void* thread_data); - - void Bagging(int /*iter*/) override { - Log::Fatal("Bagging is not supported for NCCLGBDT."); - } - - void InitNCCL(); + static void UpdateScoreThread(NCCLGBDTComponent* thread_data, const int cur_tree_id, const double shrinkage_rate, const double init_score); double BoostFromAverage(int class_id, bool update_scorer) override; @@ -126,27 +123,8 @@ class NCCLGBDT: public GBDT_T { std::vector EvalOneMetric(const Metric* metric, const double* score, const data_size_t num_data) const override; - void SetCUDADevice(int gpu_id) const { - if (gpu_list_.empty()) { - CUDASUCCESS_OR_FATAL(cudaSetDevice(gpu_id)); - } else { - CUDASUCCESS_OR_FATAL(cudaSetDevice(gpu_list_[gpu_id])); - } - } - int GetCUDADevice(int gpu_id) const { - if (gpu_list_.empty()) { - return gpu_id; - } else { - return gpu_list_[gpu_id]; - } - } - - int num_gpu_; int num_threads_; - int master_gpu_device_id_; - int master_gpu_index_; - std::vector gpu_list_; std::vector> 
per_gpu_objective_functions_; std::vector> per_gpu_train_score_updater_; std::vector>> per_gpu_gradients_; @@ -158,9 +136,14 @@ class NCCLGBDT: public GBDT_T { std::vector boosting_thread_data_; std::vector train_tree_learner_thread_data_; std::vector update_score_thread_data_; + + std::unique_ptr nccl_topology_; + std::vector nccl_gpu_rank_; std::vector nccl_communicators_; std::vector> per_gpu_tree_learners_; + + std::vector> nccl_gbdt_components_; }; } // namespace LightGBM diff --git a/src/boosting/cuda/nccl_gbdt_component.hpp b/src/boosting/cuda/nccl_gbdt_component.hpp new file mode 100644 index 000000000000..600f6c9b596f --- /dev/null +++ b/src/boosting/cuda/nccl_gbdt_component.hpp @@ -0,0 +1,95 @@ +/*! + * Copyright (c) 2023 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. + */ + +#ifndef LIGHTGBM_BOOSTING_CUDA_NCCL_GBDT_COMPONENT_HPP_ +#define LIGHTGBM_BOOSTING_CUDA_NCCL_GBDT_COMPONENT_HPP_ + +#ifdef USE_CUDA + + +#include +#include +#include "cuda_score_updater.hpp" +#include "../../treelearner/cuda/cuda_single_gpu_tree_learner.hpp" +#include + +namespace LightGBM { + +class NCCLGBDTComponent: public NCCLInfo { + public: + NCCLGBDTComponent() {} + + void Init(const Config* config, const Dataset* train_data, const int num_tree_per_iteration, const bool boosting_on_gpu, const bool is_constant_hessian) { + const data_size_t num_data_per_gpu = (train_data->num_data() + num_gpu_in_node_ - 1) / num_gpu_in_node_; + data_start_index_ = num_data_per_gpu * local_gpu_rank_; + data_end_index_ = std::min(data_start_index_ + num_data_per_gpu, train_data->num_data()); + num_data_in_gpu_ = data_end_index_ - data_start_index_; + + dataset_.reset(new Dataset(num_data_in_gpu_)); + dataset_->ReSize(num_data_in_gpu_); + dataset_->CopyFeatureMapperFrom(train_data); + std::vector used_indices(num_data_in_gpu_); + for (data_size_t data_index = data_start_index_; data_index < data_end_index_; ++data_index) { + used_indices[data_index - data_start_index_] = data_index; + } + dataset_->CopySubrowToDevice(train_data, used_indices.data(), num_data_in_gpu_, true, gpu_device_id_); + + objective_function_.reset(ObjectiveFunction::CreateObjectiveFunctionCUDA(config->objective, *config)); + train_score_updater_.reset(new CUDAScoreUpdater(dataset_.get(), num_tree_per_iteration, boosting_on_gpu)); + gradients_.reset(new CUDAVector(num_data_in_gpu_)); + hessians_.reset(new CUDAVector(num_data_in_gpu_)); + tree_learner_.reset(new CUDASingleGPUTreeLearner(config, boosting_on_gpu)); + + tree_learner_->SetNCCLInfo(nccl_communicator_, nccl_gpu_rank_, local_gpu_rank_, gpu_device_id_, train_data->num_data()); + + objective_function_->Init(dataset_->metadata(), dataset_->num_data()); + tree_learner_->Init(dataset_.get(), is_constant_hessian); + } + + ObjectiveFunction* objective_function() { return objective_function_.get(); } + + CUDAScoreUpdater* train_score_updater() { return train_score_updater_.get(); } + + score_t* gradients() { return gradients_->RawData(); } + + score_t* hessians() { return hessians_->RawData(); } + + data_size_t num_data_in_gpu() const { return num_data_in_gpu_; } + + CUDASingleGPUTreeLearner* tree_learner() { return tree_learner_.get(); } + + void SetTree(Tree* tree) { + new_tree_.reset(tree); + } + + data_size_t data_start_index() const { return data_start_index_; } + + data_size_t data_end_index() const { return data_end_index_; } + + Tree* new_tree() { return new_tree_.get(); } + + Tree* 
release_new_tree() { return new_tree_.release(); } + + void clear_new_tree() { new_tree_.reset(nullptr); } + + private: + std::unique_ptr objective_function_; + std::unique_ptr train_score_updater_; + std::unique_ptr> gradients_; + std::unique_ptr> hessians_; + std::unique_ptr dataset_; + std::unique_ptr tree_learner_; + std::unique_ptr new_tree_; + + data_size_t data_start_index_; + data_size_t data_end_index_; + data_size_t num_data_in_gpu_; +}; + +} // namespace LightGBM + +#endif // USE_CUDA + +#endif // LIGHTGBM_BOOSTING_CUDA_NCCL_GBDT_COMPONENT_HPP_ diff --git a/src/boosting/gbdt.h b/src/boosting/gbdt.h index 28ebee446fad..98f2f3235fd9 100644 --- a/src/boosting/gbdt.h +++ b/src/boosting/gbdt.h @@ -505,7 +505,7 @@ class GBDT : public GBDTBase { */ std::string OutputMetric(int iter); - double BoostFromAverage(int class_id, bool update_scorer); + virtual double BoostFromAverage(int class_id, bool update_scorer); /*! * \brief Reset gradient buffers, must be called after sample strategy is reset diff --git a/src/c_api.cpp b/src/c_api.cpp index baf934db42b1..9e7c29d69dd8 100644 --- a/src/c_api.cpp +++ b/src/c_api.cpp @@ -108,7 +108,7 @@ class SingleRowPredictor { class Booster { public: explicit Booster(const char* filename) { - boosting_.reset(Boosting::CreateBoosting("gbdt", filename)); + boosting_.reset(Boosting::CreateBoosting("gbdt", filename, std::string("cpu"), 0)); } Booster(const Dataset* train_data, @@ -122,7 +122,7 @@ class Booster { "please use continued train with input score"); } - boosting_.reset(Boosting::CreateBoosting(config_.boosting, nullptr)); + boosting_.reset(Boosting::CreateBoosting(config_.boosting, nullptr, config_.device_type, config_.num_gpus)); train_data_ = train_data; CreateObjectiveAndMetrics(); diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp index 394614af3f33..204a41c0bba3 100644 --- a/src/io/config_auto.cpp +++ b/src/io/config_auto.cpp @@ -318,8 +318,9 @@ const std::unordered_set& Config::parameter_set() { "machines", "gpu_platform_id", "gpu_device_id", + "num_gpus", + "gpu_device_id_list", "gpu_use_dp", - "num_gpu", }); return params; } @@ -656,10 +657,11 @@ void Config::GetMembersFromString(const std::unordered_map>& Config::paramet {"machines", {"workers", "nodes"}}, {"gpu_platform_id", {}}, {"gpu_device_id", {}}, + {"num_gpus", {}}, + {"gpu_device_id_list", {}}, {"gpu_use_dp", {}}, - {"num_gpu", {}}, }); return map; } @@ -1061,8 +1065,9 @@ const std::unordered_map& Config::ParameterTypes() { {"machines", "string"}, {"gpu_platform_id", "int"}, {"gpu_device_id", "int"}, + {"num_gpus", "int"}, + {"gpu_device_id_list", "string"}, {"gpu_use_dp", "bool"}, - {"num_gpu", "int"}, }); return map; } diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index e78f8a6b696c..7e77d5f819f6 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -833,9 +833,8 @@ void Dataset::ReSize(data_size_t num_data) { } } -void Dataset::CopySubrow(const Dataset* fullset, - const data_size_t* used_indices, - data_size_t num_used_indices, bool need_meta_data) { + +void Dataset::CopySubrowHostPart(const Dataset* fullset, const data_size_t* used_indices, data_size_t num_used_indices, bool need_meta_data) { CHECK_EQ(num_used_indices, num_data_); std::vector group_ids, subfeature_ids; @@ -882,6 +881,13 @@ void Dataset::CopySubrow(const Dataset* fullset, } } } +} + +void Dataset::CopySubrow(const Dataset* fullset, + const data_size_t* used_indices, + data_size_t num_used_indices, bool need_meta_data) { + CopySubrowHostPart(fullset, used_indices, num_used_indices, 
need_meta_data); + // update CUDA storage for column data and metadata device_type_ = fullset->device_type_; gpu_device_id_ = fullset->gpu_device_id_; @@ -889,7 +895,28 @@ void Dataset::CopySubrow(const Dataset* fullset, #ifdef USE_CUDA if (device_type_ == std::string("cuda")) { if (cuda_column_data_ == nullptr) { - cuda_column_data_.reset(new CUDAColumnData(fullset->num_data(), gpu_device_id_)); + cuda_column_data_.reset(new CUDAColumnData(num_used_indices, gpu_device_id_)); + metadata_.CreateCUDAMetadata(gpu_device_id_); + } + cuda_column_data_->CopySubrow(fullset->cuda_column_data(), used_indices, num_used_indices); + } + #endif // USE_CUDA +} + +void Dataset::CopySubrowToDevice(const Dataset* fullset, + const data_size_t* used_indices, + data_size_t num_used_indices, bool need_meta_data, + int gpu_device_id) { + CopySubrowHostPart(fullset, used_indices, num_used_indices, need_meta_data); + + // update CUDA storage for column data and metadata + device_type_ = fullset->device_type_; + gpu_device_id_ = gpu_device_id; + + #ifdef USE_CUDA + if (device_type_ == std::string("cuda")) { + if (cuda_column_data_ == nullptr) { + cuda_column_data_.reset(new CUDAColumnData(num_used_indices, gpu_device_id_)); metadata_.CreateCUDAMetadata(gpu_device_id_); } cuda_column_data_->CopySubrow(fullset->cuda_column_data(), used_indices, num_used_indices); diff --git a/src/objective/objective_function.cpp b/src/objective/objective_function.cpp index a203017cf36e..1d758b361bca 100644 --- a/src/objective/objective_function.cpp +++ b/src/objective/objective_function.cpp @@ -3,6 +3,7 @@ * Licensed under the MIT License. See LICENSE file in the project root for license information. */ #include +#include #include "binary_objective.hpp" #include "multiclass_objective.hpp" @@ -17,52 +18,58 @@ namespace LightGBM { +#ifdef USE_CUDA +ObjectiveFunction* ObjectiveFunction::CreateObjectiveFunctionCUDA(const std::string& type, const Config& config) { + if (type == std::string("regression")) { + return new CUDARegressionL2loss(config); + } else if (type == std::string("regression_l1")) { + return new CUDARegressionL1loss(config); + } else if (type == std::string("quantile")) { + return new CUDARegressionQuantileloss(config); + } else if (type == std::string("huber")) { + return new CUDARegressionHuberLoss(config); + } else if (type == std::string("fair")) { + return new CUDARegressionFairLoss(config); + } else if (type == std::string("poisson")) { + return new CUDARegressionPoissonLoss(config); + } else if (type == std::string("binary")) { + return new CUDABinaryLogloss(config); + } else if (type == std::string("lambdarank")) { + return new CUDALambdarankNDCG(config); + } else if (type == std::string("rank_xendcg")) { + return new CUDARankXENDCG(config); + } else if (type == std::string("multiclass")) { + return new CUDAMulticlassSoftmax(config); + } else if (type == std::string("multiclassova")) { + return new CUDAMulticlassOVA(config); + } else if (type == std::string("cross_entropy")) { + Log::Warning("Objective cross_entropy is not implemented in cuda version. Fall back to boosting on CPU."); + return new CrossEntropy(config); + } else if (type == std::string("cross_entropy_lambda")) { + Log::Warning("Objective cross_entropy_lambda is not implemented in cuda version. Fall back to boosting on CPU."); + return new CrossEntropyLambda(config); + } else if (type == std::string("mape")) { + Log::Warning("Objective mape is not implemented in cuda version. 
Fall back to boosting on CPU."); + return new RegressionMAPELOSS(config); + } else if (type == std::string("gamma")) { + Log::Warning("Objective gamma is not implemented in cuda version. Fall back to boosting on CPU."); + return new RegressionGammaLoss(config); + } else if (type == std::string("tweedie")) { + Log::Warning("Objective tweedie is not implemented in cuda version. Fall back to boosting on CPU."); + return new RegressionTweedieLoss(config); + } else if (type == std::string("custom")) { + Log::Warning("Using customized objective with cuda. This requires copying gradients from CPU to GPU, which can be slow."); + return nullptr; + } +} +#endif // USE_CUDA + ObjectiveFunction* ObjectiveFunction::CreateObjectiveFunction(const std::string& type, const Config& config) { #ifdef USE_CUDA if (config.device_type == std::string("cuda") && config.data_sample_strategy != std::string("goss") && config.boosting != std::string("rf")) { - if (type == std::string("regression")) { - return new CUDARegressionL2loss(config); - } else if (type == std::string("regression_l1")) { - return new CUDARegressionL1loss(config); - } else if (type == std::string("quantile")) { - return new CUDARegressionQuantileloss(config); - } else if (type == std::string("huber")) { - return new CUDARegressionHuberLoss(config); - } else if (type == std::string("fair")) { - return new CUDARegressionFairLoss(config); - } else if (type == std::string("poisson")) { - return new CUDARegressionPoissonLoss(config); - } else if (type == std::string("binary")) { - return new CUDABinaryLogloss(config); - } else if (type == std::string("lambdarank")) { - return new CUDALambdarankNDCG(config); - } else if (type == std::string("rank_xendcg")) { - return new CUDARankXENDCG(config); - } else if (type == std::string("multiclass")) { - return new CUDAMulticlassSoftmax(config); - } else if (type == std::string("multiclassova")) { - return new CUDAMulticlassOVA(config); - } else if (type == std::string("cross_entropy")) { - Log::Warning("Objective cross_entropy is not implemented in cuda version. Fall back to boosting on CPU."); - return new CrossEntropy(config); - } else if (type == std::string("cross_entropy_lambda")) { - Log::Warning("Objective cross_entropy_lambda is not implemented in cuda version. Fall back to boosting on CPU."); - return new CrossEntropyLambda(config); - } else if (type == std::string("mape")) { - Log::Warning("Objective mape is not implemented in cuda version. Fall back to boosting on CPU."); - return new RegressionMAPELOSS(config); - } else if (type == std::string("gamma")) { - Log::Warning("Objective gamma is not implemented in cuda version. Fall back to boosting on CPU."); - return new RegressionGammaLoss(config); - } else if (type == std::string("tweedie")) { - Log::Warning("Objective tweedie is not implemented in cuda version. Fall back to boosting on CPU."); - return new RegressionTweedieLoss(config); - } else if (type == std::string("custom")) { - Log::Warning("Using customized objective with cuda. 
This requires copying gradients from CPU to GPU, which can be slow."); - return nullptr; - } + return CreateObjectiveFunctionCUDA(type, config); } else { #endif // USE_CUDA if (type == std::string("regression")) { diff --git a/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp b/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp index fa782ebaad25..da1a3f09ce90 100644 --- a/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp +++ b/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp @@ -23,7 +23,7 @@ namespace LightGBM { #define CUDA_SINGLE_GPU_TREE_LEARNER_BLOCK_SIZE (1024) -class CUDASingleGPUTreeLearner: public SerialTreeLearner { +class CUDASingleGPUTreeLearner: public SerialTreeLearner, public NCCLInfo { public: explicit CUDASingleGPUTreeLearner(const Config* config, const bool boosting_on_cuda); diff --git a/src/treelearner/tree_learner.cpp b/src/treelearner/tree_learner.cpp index 0018bdaf8b94..250019e27dbf 100644 --- a/src/treelearner/tree_learner.cpp +++ b/src/treelearner/tree_learner.cpp @@ -40,7 +40,7 @@ TreeLearner* TreeLearner::CreateTreeLearner(const std::string& learner_type, con } } else if (device_type == std::string("cuda")) { if (learner_type == std::string("serial")) { - if (config->num_gpu == 1) { + if (config->num_gpus == 1) { return new CUDASingleGPUTreeLearner(config, boosting_on_cuda); } else { Log::Fatal("Currently cuda version only supports training on a single GPU.");
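The host-thread-per-GPU dispatch pattern that `NCCLTopology::DispatchPerDevice` introduces (one `std::thread` per GPU, each pinned to its device with `cudaSetDevice` before running a callback, with the master device restored afterwards) can be illustrated with the following standalone sketch. It is not part of the patch; the helper names and the simplified error handling are hypothetical, and NCCL communicator setup is intentionally omitted.

```cpp
// Standalone illustration of the host-thread-per-GPU dispatch pattern used by
// NCCLTopology::DispatchPerDevice. Names and error handling are simplified and
// hypothetical; NCCL communicator setup is intentionally omitted.
#include <cuda_runtime.h>

#include <cstdio>
#include <cstdlib>
#include <functional>
#include <thread>
#include <vector>

static void CheckCuda(cudaError_t status) {
  if (status != cudaSuccess) {
    std::fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
    std::exit(EXIT_FAILURE);
  }
}

// Launch one host thread per device, bind each thread to its GPU, run the
// callback, then return the calling thread to the master device.
void DispatchPerDevice(const std::vector<int>& gpu_list, int master_gpu_device_id,
                       const std::function<void(int)>& func) {
  std::vector<std::thread> host_threads;
  host_threads.reserve(gpu_list.size());
  for (int device_id : gpu_list) {
    host_threads.emplace_back([device_id, &func]() {
      CheckCuda(cudaSetDevice(device_id));  // pin this host thread to its GPU
      func(device_id);                      // per-device work (e.g., gradient computation)
    });
  }
  for (auto& t : host_threads) {
    t.join();
  }
  CheckCuda(cudaSetDevice(master_gpu_device_id));  // restore the master device
}

int main() {
  int device_count = 0;
  CheckCuda(cudaGetDeviceCount(&device_count));
  if (device_count == 0) return 0;
  std::vector<int> gpu_list;
  for (int i = 0; i < device_count; ++i) gpu_list.push_back(i);
  DispatchPerDevice(gpu_list, /*master_gpu_device_id=*/0, [](int device_id) {
    std::printf("running on device %d\n", device_id);
  });
  return 0;
}
```

The patch uses the same structure but dispatches over `NCCLGBDTComponent` objects (one per GPU), so each callback sees that device's dataset partition, gradients, score updater, and tree learner.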